md2bb.pl 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286
  1. #!/usr/bin/perl
  2. #
  3. # Markdown -- A text-to-HTML conversion tool for web writers
  4. #
  5. # Copyright (c) 2004 John Gruber
  6. # <http://daringfireball.net/projects/markdown/>
  7. #
  8. package Markdown2bbcode;
  9. require 5.006_000;
  10. use strict;
  11. use warnings;
  12. use Digest::MD5 qw(md5_hex);
  13. use vars qw($VERSION);
  14. $VERSION = '1.0.1';
  15. # Tue 14 Dec 2004
  16. ## Disabled; causes problems under Perl 5.6.1:
  17. # use utf8;
  18. # binmode( STDOUT, ":utf8" ); # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
  19. #
  20. # Global default settings:
  21. #
  # Number of columns a tab stop occupies; also the indent that marks a
  # code block.
  my $g_tab_width = 4;

  #
  # File-scope state shared by the conversion routines below.
  #

  # Pattern matching text whose [square brackets] are balanced, nested to
  # any depth.  The variable is declared before it is assigned because the
  # pattern refers to itself through the deferred (??{ ... }) construct.
  my $g_nested_brackets;
  $g_nested_brackets = qr{
      (?>                                 # atomic group, no backtracking into it
          [^\[\]]+                        # a run of non-bracket characters
        |
          \[
              (??{ $g_nested_brackets })  # recurse for the bracketed content
          \]
      )*
  }x;

  # Maps each character that Markdown treats specially to the MD5 hex digest
  # of that character; the digest is used as a stand-in while text is being
  # processed.
  my %g_escape_table =
      map { ($_ => md5_hex($_)) } split //, '\\`*_{}[]()>#+-.!';

  # Per-document lookup tables, filled and consulted by the routines below:
  # link id => URL, link id => title, and digest => hashed block text.
  my (%g_urls, %g_titles, %g_html_blocks);

  # Non-zero while list items are being processed (see _ProcessListItems()).
  my $g_list_level = 0;
  # Always reports success.  Presumably the activation hook expected by a
  # plugin host -- TODO confirm against the host's plugin API.
  sub start {
      return 1;
  }
  # Per-story hook: converts the story body in place when the story asks
  # for Markdown markup.
  #
  # Parameters (only $body_ref is used here):
  #   $pkg, $path, $filename, $story_ref, $title_ref, $body_ref
  #
  # The body is converted only when $meta::markup is defined and contains
  # just the word "markdown" (any letter case, optional surrounding
  # whitespace).  Always returns 1.
  sub story {
      my ($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;

      my $wants_markdown = defined($meta::markup)
          && $meta::markup =~ /^\s*markdown\s*$/i;

      $$body_ref = Markdown($$body_ref) if $wants_markdown;

      return 1;
  }
  59. #############################################################################
  no warnings 'once';
  use warnings;
  use Getopt::Long;

  # Declared at file scope so that any later code referring to them keeps
  # working.
  my %cli_opts;
  my $text;

  #
  # Command-line driver.
  #
  # Runs only when this file is executed directly.  When the file is loaded
  # by another program (require/use/do), caller() is true and the block is
  # skipped, so loading the file no longer reads input or prints output as
  # a side effect.
  #
  # Switches:
  #   --version       print a version/copyright banner and exit
  #   --shortversion  print just the version string and exit
  #   --html4tags     accepted but not consulted in this section
  #
  # With neither version switch, all input (files named on the command line,
  # or standard input) is converted and printed.
  #
  unless (caller) {

      #### Check for command-line switches: #################
      Getopt::Long::Configure('pass_through');
      GetOptions(\%cli_opts,
          'version',
          'shortversion',
          'html4tags',
      );

      if ($cli_opts{'version'}) {         # Version info
          print "\nThis is Markdown, version $VERSION.\n";
          print "Copyright 2004 John Gruber\n";
          print "http://daringfireball.net/projects/markdown/\n\n";
          exit 0;
      }

      if ($cli_opts{'shortversion'}) {    # Just the version number string.
          print $VERSION;
          exit 0;
      }

      #### Process incoming text: ###########################
      {
          local $/;                       # Slurp the whole input
          $text = <>;
      }

      # Reading can yield undef when there is no input at all; treat that as
      # an empty document instead of passing undef on.
      $text = '' unless defined $text;

      print Markdown($text);
  }
  88. ##############################################################################
  sub Markdown {
  #
  # Main entry point: converts one document and returns the result with a
  # trailing newline appended.
  #
  # The order of the steps matters: HTML blocks are hashed first, link
  # definitions are stripped next, then the block-level transformations run,
  # and the stand-in digests are swapped back to real characters last.
  #
      my $text = shift;

      # Start every document with empty lookup tables; otherwise entries
      # from a previously converted document would leak into this one.
      %$_ = () for (\%g_urls, \%g_titles, \%g_html_blocks);

      # Normalise line endings: both CRLF (DOS) and a lone CR (Mac) become LF.
      $text =~ s{\r\n?}{\n}g;

      # Guarantee the text ends with a blank line.
      $text .= "\n\n";

      # Tabs become spaces.
      $text = _Detab($text);

      # Empty out whitespace-only lines so that later patterns can recognise
      # consecutive blank lines with a plain /\n+/.
      $text =~ s/^[ \t]+$//mg;

      $text = _HashHTMLBlocks($text);         # protect block-level HTML
      $text = _StripLinkDefinitions($text);   # collect [id]: url definitions
      $text = _RunBlockGamut($text);          # headers, lists, quotes, ...
      $text = _UnescapeSpecialChars($text);   # restore escaped characters

      return $text . "\n";
  }
  sub _StripLinkDefinitions {
  #
  # Removes link definitions of the form
  #
  #     [id]: url "optional title"
  #
  # from the text and records them in %g_urls and %g_titles, keyed by the
  # lower-cased id (link ids are case-insensitive).
  #
  # Parameter: the document text.
  # Returns:   the text with every link definition removed.
  #
      my $text = shift;
      my $less_than_tab = $g_tab_width - 1;

      # One definition is removed per iteration until none is left.
      while ($text =~ s{
                          ^[ ]{0,$less_than_tab}\[(.+)\]:   # capture 1: id
                            [ \t]*
                            \n?                 # maybe *one* newline
                            [ \t]*
                          <?(\S+?)>?            # capture 2: url
                            [ \t]*
                            \n?                 # maybe one newline
                            [ \t]*
                          (?:
                              (?<=\s)           # lookbehind for whitespace
                              ["(]
                              (.+?)             # capture 3: title
                              [")]
                              [ \t]*
                          )?                    # title is optional
                          (?:\n+|\Z)
                      }
                      {}mx) {
          # Copy the captures before running any other pattern.
          my ($id, $url, $title) = (lc $1, $2, $3);

          $g_urls{$id} = $url;

          # Test definedness rather than truth so that a title consisting of
          # the single character "0" is still recorded.
          if (defined $title) {
              $title =~ s/"/&quot;/g;
              $g_titles{$id} = $title;
          }
      }

      return $text;
  }
  sub _HashHTMLBlocks {
  #
  # Replaces each block-level HTML construct in the text with the MD5 hex
  # digest of that construct, set off by blank lines, and remembers the
  # original text in %g_html_blocks under that digest.
  #
  # Only block-level tags are handled; inline markup is left in place so
  # that it can still end up inside a paragraph.
  #
  # Parameter: the text to scan.
  # Returns:   the text with the matched blocks replaced by their digests.
  #
      my $text = shift;
      my $less_than_tab = $g_tab_width - 1;

      # Stores one block and returns the placeholder that replaces it.
      my $stash = sub {
          my $block = shift;
          my $key   = md5_hex($block);
          $g_html_blocks{$key} = $block;
          return "\n\n" . $key . "\n\n";
      };

      # Tag names treated as block-level.  The first list additionally
      # contains ins and del.
      my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
      my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;

      # Pass 1: nested blocks such as
      #
      #     <div>
      #         <div>
      #         inner tags must be indented.
      #         </div>
      #     </div>
      #
      # Both outer tags must sit at the left margin.  This has to run before
      # the more liberal pass 2, which would otherwise stop at the first
      # closing tag it meets.
      $text =~ s{
                  (                       # capture 1: the whole block
                      ^                   # start of line (with /m)
                      <($block_tags_a)    # capture 2: tag name
                      \b                  # word break
                      (.*\n)*?            # any number of lines, minimally matching
                      </\2>               # the matching end tag
                      [ \t]*              # trailing spaces/tabs
                      (?=\n+|\Z)          # followed by a newline or end of document
                  )
              }{
                  $stash->("$1");
              }egmx;

      # Pass 2: more liberal, the closing tag may sit anywhere on its line.
      $text =~ s{
                  (                       # capture 1: the whole block
                      ^                   # start of line (with /m)
                      <($block_tags_b)    # capture 2: tag name
                      \b                  # word break
                      (.*\n)*?            # any number of lines, minimally matching
                      .*</\2>             # the matching end tag
                      [ \t]*              # trailing spaces/tabs
                      (?=\n+|\Z)          # followed by a newline or end of document
                  )
              }{
                  $stash->("$1");
              }egmx;

      # Pass 3: a standalone <hr /> tag, kept separate to avoid complicating
      # the patterns above.
      $text =~ s{
                  (?:
                      (?<=\n\n)           # starting after a blank line
                      |                   # or
                      \A\n?               # the beginning of the doc
                  )
                  (                       # capture 1: the whole tag
                      [ ]{0,$less_than_tab}
                      <(hr)               # capture 2: tag name
                      \b                  # word break
                      ([^<>])*?           # attributes, if any
                      /?>                 # end of the tag
                      [ \t]*
                      (?=\n{2,}|\Z)       # followed by a blank line or end of document
                  )
              }{
                  $stash->("$1");
              }egx;

      # Pass 4: a standalone HTML comment.
      $text =~ s{
                  (?:
                      (?<=\n\n)           # starting after a blank line
                      |                   # or
                      \A\n?               # the beginning of the doc
                  )
                  (                       # capture 1: the whole comment
                      [ ]{0,$less_than_tab}
                      (?s:
                          <!
                          (--.*?--\s*)+
                          >
                      )
                      [ \t]*
                      (?=\n{2,}|\Z)       # followed by a blank line or end of document
                  )
              }{
                  $stash->("$1");
              }egx;

      return $text;
  }
  sub _RunBlockGamut {
  #
  # Applies, in order, every transformation that produces block-level
  # output: headers, lists, code blocks, block quotes, and finally
  # paragraphs.
  #
  # Parameter: the text to transform.
  # Returns:   the transformed text.
  #
      my $text = shift;

      my @passes = (
          \&_DoHeaders,
          \&_DoLists,
          \&_DoCodeBlocks,
          \&_DoBlockQuotes,

          # Markdown() already hashed the raw HTML found in the source.
          # This second run hashes block-level markup produced by the
          # passes above, so that paragraph formation leaves it alone.
          \&_HashHTMLBlocks,

          \&_FormParagraphs,
      );

      for my $pass (@passes) {
          $text = $pass->($text);
      }

      return $text;
  }
  sub _RunSpanGamut {
  #
  # Applies, in order, every transformation that works *inside* a block
  # (paragraph, header, list item): code spans, escapes, images, links,
  # autolinks, emphasis, and hard line breaks.
  #
  # Parameter: the text of one block.
  # Returns:   the transformed text.
  #
      my $text = shift;

      my @passes = (
          \&_DoCodeSpans,
          \&_EscapeSpecialChars,

          # Images go before anchors, because ![foo][f] looks like an anchor.
          \&_DoImages,
          \&_DoAnchors,

          # <http://example.com/> style links.  These go after the anchors,
          # because inline links may use < and > around the url, as in
          # [this](<url>).
          \&_DoAutoLinks,

          \&_DoItalicsAndBold,
      );

      for my $pass (@passes) {
          $text = $pass->($text);
      }

      # Hard breaks: two or more spaces before a newline.
      $text =~ s/ {2,}\n/ \n/g;

      return $text;
  }
  sub _EscapeSpecialChars {
  #
  # Rebuilds the text from its HTML tokens, protecting characters that
  # would otherwise be mistaken for Markdown syntax:
  #
  #   * inside a tag, every * and _ is replaced by its stand-in digest so
  #     that it cannot be taken for emphasis;
  #   * in the text between tags, backslash escapes are encoded.
  #
  # Parameter: the text to process.
  # Returns:   the rebuilt text.
  #
      my $text = shift;

      # Plain assignment: the variable is brand new, so an or-assign would
      # have nothing to fall back on.
      my $tokens = _TokenizeHTML($text);

      my $escaped = '';
      foreach my $token (@$tokens) {
          # Work on a copy of the token text; the token list is left as is.
          my ($type, $chunk) = @$token;

          if ($type eq "tag") {
              $chunk =~ s! \* !$g_escape_table{'*'}!gx;
              $chunk =~ s! _ !$g_escape_table{'_'}!gx;
          }
          else {
              $chunk = _EncodeBackslashEscapes($chunk);
          }

          $escaped .= $chunk;
      }

      return $escaped;
  }
  sub _DoAnchors {
  #
  # Turns Markdown links into BBCode [url=...]...[/url] tags.
  #
  # Two forms are recognised:
  #   reference style:  [link text][id]   (an empty id means "use the text")
  #   inline style:     [link text](url "optional title")
  #
  # A reference whose id has no recorded URL is left exactly as written.
  # The optional title is matched but does not appear in the output.
  #
  # Parameter: the text to process.
  # Returns:   the text with links converted.
  #
      my $text = shift;

      # Builds the tag.  Any * or _ in the url is swapped for its stand-in
      # digest so that it cannot be taken for emphasis later on.
      my $bbcode_link = sub {
          my ($url, $label) = @_;
          $url =~ s! \* !$g_escape_table{'*'}!gx;
          $url =~ s! _ !$g_escape_table{'_'}!gx;
          return "\[url=$url\]$label\[/url\]";
      };

      #
      # Reference-style links: [link text] [id]
      #
      $text =~ s{
          (                               # capture 1: the whole construct
            \[
              ($g_nested_brackets)        # capture 2: link text
            \]

            [ ]?                          # one optional space
            (?:\n[ ]*)?                   # one optional newline followed by spaces

            \[
              (.*?)                       # capture 3: id
            \]
          )
      }{
          my ($whole_match, $link_text, $link_id) = ($1, $2, lc $3);

          # Shortcut form [this][] uses the link text as the id.
          $link_id = lc $link_text if $link_id eq "";

          defined $g_urls{$link_id}
              ? $bbcode_link->($g_urls{$link_id}, $link_text)
              : $whole_match;
      }xsge;

      #
      # Inline-style links: [link text](url "optional title")
      #
      $text =~ s{
          (                               # capture 1: the whole construct
            \[
              ($g_nested_brackets)        # capture 2: link text
            \]
            \(                            # literal paren
              [ \t]*
              <?(.*?)>?                   # capture 3: href
              [ \t]*
              (                           # capture 4: quoted title
                (['"])                    # capture 5: quote char
                (.*?)                     # capture 6: title
                \5                        # matching quote
              )?                          # title is optional
            \)
          )
      }{
          my ($link_text, $url) = ($2, $3);
          $bbcode_link->($url, $link_text);
      }xsge;

      return $text;
  }
  sub _DoImages {
  #
  # Turns Markdown images into BBCode [img="alt text"]url[/img] tags.
  #
  # Two forms are recognised:
  #   reference style:  ![alt text][id]   (an empty id means "use the alt text")
  #   inline style:     ![alt text](url "optional title")
  #
  # A reference whose id has no recorded URL is left exactly as written.
  # The optional title is matched but does not appear in the output.
  #
  # Parameter: the text to process.
  # Returns:   the text with images converted.
  #
      my $text = shift;

      # Builds the tag.  Double quotes in the alt text become &quot;, and
      # any * or _ in the url is swapped for its stand-in digest so that it
      # cannot be taken for emphasis later on.
      my $bbcode_image = sub {
          my ($url, $alt_text) = @_;
          $alt_text =~ s/"/&quot;/g;
          $url =~ s! \* !$g_escape_table{'*'}!gx;
          $url =~ s! _ !$g_escape_table{'_'}!gx;
          return "\[img=\"$alt_text\"\]$url\[/img\]";
      };

      #
      # Reference-style images: ![alt text][id]
      #
      $text =~ s{
          (                               # capture 1: the whole construct
            !\[
              (.*?)                       # capture 2: alt text
            \]

            [ ]?                          # one optional space
            (?:\n[ ]*)?                   # one optional newline followed by spaces

            \[
              (.*?)                       # capture 3: id
            \]
          )
      }{
          my ($whole_match, $alt_text, $link_id) = ($1, $2, lc $3);

          # Shortcut form ![this][] uses the alt text as the id.
          $link_id = lc $alt_text if $link_id eq "";

          defined $g_urls{$link_id}
              ? $bbcode_image->($g_urls{$link_id}, $alt_text)
              : $whole_match;             # unknown id: leave intact
      }xsge;

      #
      # Inline images: ![alt text](url "optional title")
      #
      $text =~ s{
          (                               # capture 1: the whole construct
            !\[
              (.*?)                       # capture 2: alt text
            \]
            \(                            # literal paren
              [ \t]*
              <?(\S+?)>?                  # capture 3: src url
              [ \t]*
              (                           # capture 4: quoted title
                (['"])                    # capture 5: quote char
                (.*?)                     # capture 6: title
                \5                        # matching quote
                [ \t]*
              )?                          # title is optional
            \)
          )
      }{
          my ($alt_text, $url) = ($2, $3);
          $bbcode_image->($url, $alt_text);
      }xsge;

      return $text;
  }
  sub _DoHeaders {
  #
  # Converts Markdown headers into BBCode [h]...[/h] tags.  Every header
  # level produces the same tag.
  #
  # Parameter: the text to process.
  # Returns:   the text with headers converted.
  #
      my $text = shift;

      # Wraps one header title, after running the span-level passes on it.
      my $as_heading = sub {
          my $title = shift;
          return "[h]" . _RunSpanGamut($title) . "[/h]\n\n";
      };

      # Setext-style headers, underlined with = or with -:
      #
      #     Header 1
      #     ========
      #
      #     Header 2
      #     --------
      #
      # The = form is handled before the - form.
      for my $rule ('=', '-') {
          $text =~ s{ ^(.+)[ \t]*\n\Q$rule\E+[ \t]*\n+ }{
              $as_heading->("$1");
          }egmx;
      }

      # atx-style headers, introduced by one to six # characters:
      #
      #     # Header 1
      #     ## Header 2
      #     ## Header 2 with closing hashes ##
      #     ...
      #     ###### Header 6
      #
      $text =~ s{
              ^(\#{1,6})  # capture 1: the leading run of hashes
              [ \t]*
              (.+?)       # capture 2: header text
              [ \t]*
              \#*         # optional closing hashes (not counted)
              \n+
          }{
              $as_heading->("$2");
          }egmx;

      return $text;
  }
  sub _DoLists {
  #
  # Converts Markdown lists into BBCode: a bulleted list becomes
  # [list]...[/list] and a numbered list becomes [list=1]...[/list].
  # Nested lists produce the same tags as top-level lists.
  #
  # Parameter: the text to process.
  # Returns:   the text with lists converted.
  #
      my $text = shift;
      my $less_than_tab = $g_tab_width - 1;

      # Re-usable patterns to match list item bullets and number markers:
      my $marker_ul  = qr/[*+-]/;
      my $marker_ol  = qr/\d+[.]/;
      my $marker_any = qr/(?:$marker_ul|$marker_ol)/;

      # Re-usable pattern to match an entire bulleted or numbered list:
      my $whole_list = qr{
          (                               # capture 1: the whole list
            (                             # capture 2
              [ ]{0,$less_than_tab}
              (${marker_any})             # capture 3: first list item marker
              [ \t]+
            )
            (?s:.+?)
            (                             # capture 4
                \z
              |
                \n{2,}
                (?=\S)
                (?!                       # not followed by another list item marker
                  [ \t]*
                  ${marker_any}[ \t]+
                )
            )
          )
      }mx;

      # Renders one matched list.  The type of the first marker decides
      # between a bulleted and a numbered list.
      my $render_list = sub {
          my ($list, $first_marker) = @_;
          my $list_type = ($first_marker =~ m/$marker_ul/) ? "list" : "list=1";

          # Turn double returns into triple returns, so that a paragraph can
          # be made for the last item in a list, if necessary:
          $list =~ s/\n{2,}/\n\n\n/g;

          return "[$list_type]\n"
               . _ProcessListItems($list, $marker_any)
               . "[/list]\n";
      };

      # A nested list may start at the beginning of any line; a top-level
      # list must follow a blank line or start the document (see the
      # extended comment in _ProcessListItems()).
      #
      # The two substitutions are deliberately kept as separate static
      # statements: the original author reported that one substitution with
      # a conditionally chosen pattern failed in one hosting environment.
      if ($g_list_level) {
          $text =~ s{
                  ^
                  $whole_list
              }{
                  my ($list, $first_marker) = ($1, $3);
                  $render_list->($list, $first_marker);
              }egmx;
      }
      else {
          $text =~ s{
                  (?:(?<=\n\n)|\A\n?)
                  $whole_list
              }{
                  my ($list, $first_marker) = ($1, $3);
                  $render_list->($list, $first_marker);
              }egmx;
      }

      return $text;
  }
  sub _ProcessListItems {
  #
  # Splits the text of one list into its items and renders each item as a
  # BBCode list entry.
  #
  # Parameters:
  #   $list_str   - the text of a single list
  #   $marker_any - pattern matching any list item marker
  # Returns: the rendered items, one entry per item.
  #
      my ($list_str, $marker_any) = @_;

      # $g_list_level counts how deep inside lists we currently are: it goes
      # up on entry and down on exit, and is zero outside any list.
      #
      # The distinction matters for text such as
      #
      #     I recommend upgrading to version
      #     8. Oops, now this line is treated
      #     as a sub-list.
      #
      # Outside a list this stays a single paragraph, even though its second
      # line starts with a digit-period-space sequence.  Inside a list (or
      # sub-list) such a line starts a sub-list.  It is a kludge: this part
      # of the syntax cannot be parsed perfectly without guessing intent.
      $g_list_level++;

      # Trim trailing blank lines:
      $list_str =~ s/\n{2,}\z/\n/;

      $list_str =~ s{
          (\n)?                           # capture 1: blank line before the item
          (^[ \t]*)                       # capture 2: leading whitespace
          ($marker_any) [ \t]+            # capture 3: list marker
          ((?s:.+?)                       # capture 4: list item text
          (\n{1,2}))
          (?= \n* (\z | \2 ($marker_any) [ \t]+))
      }{
          # Copy the captures before running any other pattern.
          my ($blank_before, $body) = ($1, $4);

          if ($blank_before || $body =~ m/\n{2,}/) {
              # The item is set off by blank lines or contains one itself:
              # treat its content as block-level text.
              $body = _RunBlockGamut(_Outdent($body));
          }
          else {
              # Recursion for sub-lists:
              $body = _DoLists(_Outdent($body));
              chomp $body;
              $body = _RunSpanGamut($body);
          }

          "[\\*]" . $body . "\n";
      }egmx;

      $g_list_level--;
      return $list_str;
  }
  sub _DoCodeBlocks {
  #
  # Converts indented code blocks into BBCode [code]...[/code] blocks.
  #
  # A code block is one or more lines that each start with a tab or with a
  # tab-width of spaces, following a blank line or the start of the text.
  #
  # Parameter: the text to process.
  # Returns:   the text with code blocks converted.
  #
      my $text = shift;

      $text =~ s{
              (?:\n\n|\A)
              (                       # capture 1: the code block
                (?:
                  (?:[ ]{$g_tab_width} | \t)  # a tab or a tab-width of spaces
                  .*\n+
                )+
              )
              ((?=^[ ]{0,$g_tab_width}\S)|\Z) # non-space at line start, or end of doc
          }{
              # Remove one level of indentation, encode the special
              # characters, then expand any remaining tabs.
              my $code = _Detab(_EncodeCode(_Outdent($1)));

              $code =~ s/\A\n+//;     # trim leading newlines
              $code =~ s/\s+\z//;     # trim trailing whitespace

              "\n\n[code]" . $code . "\n[/code]\n\n";
          }egmx;

      return $text;
  }
  sub _DoCodeSpans {
  #
  # Converts backtick-quoted code spans into BBCode [code]...[/code].
  #
  #   * The delimiter is a run of one or more backticks; the span ends at
  #     the next run of exactly the same length.  Using a longer run as the
  #     delimiter allows literal backticks inside the span, so
  #
  #         Just type ``foo `bar` baz`` at the prompt.
  #
  #     yields the span text
  #
  #         foo `bar` baz
  #
  #     There is no fixed limit on the delimiter length: to include three
  #     consecutive backticks, use four as the delimiter, and so on.
  #
  #   * Spaces next to the delimiters are trimmed, which allows a literal
  #     backtick at either edge of the span:
  #
  #         ... type `` `bar` `` ...
  #
  #     yields the span text
  #
  #         `bar`
  #
  # Parameter: the text to process.
  # Returns:   the text with code spans converted.
  #
      my $text = shift;

      $text =~ s{
              (`+)        # capture 1: opening run of backticks
              (.+?)       # capture 2: the code
              (?<!`)
              \1          # matching closer
              (?!`)
          }{
              my $code = "$2";
              $code =~ s/^[ \t]*//g;  # leading whitespace
              $code =~ s/[ \t]*$//g;  # trailing whitespace
              "\[code\]" . _EncodeCode($code) . "\[/code\]";
          }egsx;

      return $text;
  }
  sub _EncodeCode {
  #
  # Encodes characters inside a code span or code block so that they are
  # treated as literals and lose their special Markdown meaning.
  #
  # Parameter: the code text.
  # Returns:   the encoded text.
  #
      my $code = shift;

      # Ampersands first, so that the entities written below are not
      # themselves re-encoded.
      $code =~ s/&/&amp;/g;

      # Angle brackets:
      $code =~ s/</&lt;/g;
      $code =~ s/>/&gt;/g;

      # Characters that are magic in Markdown are swapped for their stand-in
      # digests.  One pass is enough: a digest consists of hex digits only,
      # so a replacement can never introduce another character of the set.
      $code =~ s/([*_{}\[\]\\])/$g_escape_table{$1}/g;

      return $code;
  }
  sub _DoItalicsAndBold {
  #
  # Converts Markdown emphasis into BBCode: text wrapped in ** or __
  # becomes [b]...[/b], text wrapped in * or _ becomes [i]...[/i].
  #
  # Parameter: the text to process.
  # Returns:   the text with emphasis converted.
  #
      my $text = shift;

      for ($text) {
          # Bold must go first, otherwise the doubled markers would be
          # consumed as two single ones.
          s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
           { "[b]" . $2 . "[/b]" }gsxe;

          s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
           { "[i]" . $2 . "[/i]" }gsxe;
      }

      return $text;
  }
  sub _DoBlockQuotes {
  #
  # Converts Markdown block quotes (lines introduced by >) into BBCode
  # [quote]...[/quote] blocks.  The quoted content is itself run through
  # the block-level passes, so quotes may contain further block structure.
  #
  # Parameter: the text to process.
  # Returns:   the text with block quotes converted.
  #
      my $text = shift;

      $text =~ s{
            (                         # capture 1: the whole quoted region
              (
                ^[ \t]*>[ \t]?        # '>' at the start of a line
                  .+\n                # rest of the first line
                (.+\n)*               # subsequent consecutive lines
                \n*                   # blanks
              )+
            )
          }{
              my $quoted = $1;

              $quoted =~ s/^[ \t]*>[ \t]?//gm;    # trim one level of quoting
              $quoted =~ s/^[ \t]+$//mg;          # trim whitespace-only lines
              $quoted = _RunBlockGamut($quoted);  # recurse

              $quoted =~ s/^/ /g;

              # Leading spaces would disturb <pre> content, so they are
              # taken out again inside any <pre> element:
              $quoted =~ s{
                      (\s*<pre>.+?</pre>)
                  }{
                      (my $pre = $1) =~ s/^ //mg;
                      $pre;
                  }egsx;

              "[quote]\n$quoted\n[/quote]\n\n";
          }egmx;

      return $text;
  }
  sub _FormParagraphs {
  #
  # Splits the text into paragraphs at blank lines, runs the span-level
  # passes on every paragraph that is not a hashed-block placeholder, and
  # then swaps each placeholder back for the block it stands for.
  #
  # Parameter: the text to process.
  # Returns:   the paragraphs joined by blank lines.
  #
      my $text = shift;

      # Strip leading and trailing newlines:
      $text =~ s/\A\n+//;
      $text =~ s/\n+\z//;

      my @grafs = split(/\n{2,}/, $text);

      # First pass: ordinary paragraphs get the span-level treatment and
      # lose their leading whitespace.
      for my $graf (@grafs) {
          next if defined $g_html_blocks{$graf};
          $graf = _RunSpanGamut($graf);
          $graf =~ s/^[ \t]*//;
      }

      # Second pass: unhashify the protected blocks.
      for my $graf (@grafs) {
          $graf = $g_html_blocks{$graf} if defined $g_html_blocks{$graf};
      }

      return join "\n\n", @grafs;
  }
  796. #~ sub _EncodeAmpsAndAngles {
  797. #~ # Smart processing for ampersands and angle brackets that need to be encoded.
  798. #~ my $text = shift;
  799. #~ # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
  800. #~ # http://bumppo.net/projects/amputator/
  801. #~ $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
  802. #~ # Encode naked <'s
  803. #~ $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
  804. #~ return $text;
  805. #~ }
  sub _EncodeBackslashEscapes {
  #
  # Parameter: a string.
  # Returns:   the string with every backslash escape sequence (a backslash
  #            followed by one of the characters listed below) replaced by
  #            the stand-in digest of the escaped character.
  #
      my $str = shift;

      # The order is significant: escaped backslashes must be processed
      # first, so that the second backslash of a pair is never read as the
      # start of another escape.
      my @escapable = (
          '\\', '`', '*', '_', '{', '}', '[', ']',
          '(',  ')', '>', '#', '+', '-', '.', '!',
      );

      for my $ch (@escapable) {
          $str =~ s/\\\Q$ch\E/$g_escape_table{$ch}/g;
      }

      return $str;
  }
  sub _DoAutoLinks {
  #
  # Converts automatic links such as <http://example.com/> into BBCode
  # [url=...]...[/url] tags, with the address serving as both the target
  # and the visible text.  This matches the tag produced for ordinary links
  # by _DoAnchors().
  #
  # Recognised schemes are http, https and ftp, in any letter case.  The
  # address may not contain whitespace, quotes, or a closing angle bracket.
  #
  # Automatic links for e-mail addresses are not supported.
  #
  # Parameter: the text to process.
  # Returns:   the text with automatic links converted.
  #
      my $text = shift;

      $text =~ s{<((https?|ftp):[^'">\s]+)>}{\[url=$1\]$1\[/url\]}gi;

      return $text;
  }
  849. #~ sub _EncodeEmailAddress {
  850. #~ #
  851. #~ # Input: an email address, e.g. "foo@example.com"
  852. #~ #
  853. #~ # Output: the email address as a mailto link, with each character
  854. #~ # of the address encoded as either a decimal or hex entity, in
  855. #~ # the hopes of foiling most address harvesting spam bots. E.g.:
  856. #~ #
  857. #~ # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
  858. #~ # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
  859. #~ # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
  860. #~ #
  861. #~ # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
  862. #~ # mailing list: <http://tinyurl.com/yu7ue>
  863. #~ #
  864. #~ my $addr = shift;
  865. #~ srand;
  866. #~ my @encode = (
  867. #~ sub { '&#' . ord(shift) . ';' },
  868. #~ sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
  869. #~ sub { shift },
  870. #~ );
  871. #~ $addr = "mailto:" . $addr;
  872. #~ $addr =~ s{(.)}{
  873. #~ my $char = $1;
  874. #~ if ( $char eq '@' ) {
  875. #~ # this *must* be encoded. I insist.
  876. #~ $char = $encode[int rand 1]->($char);
  877. #~ } elsif ( $char ne ':' ) {
  878. #~ # leave ':' alone (to spot mailto: later)
  879. #~ my $r = rand;
  880. #~ # roughly 10% raw, 45% hex, 45% dec
  881. #~ $char = (
  882. #~ $r > .9 ? $encode[2]->($char) :
  883. #~ $r < .45 ? $encode[1]->($char) :
  884. #~ $encode[0]->($char)
  885. #~ );
  886. #~ }
  887. #~ $char;
  888. #~ }gex;
  889. #~ $addr = qq{<a href="$addr">$addr</a>};
  890. #~ $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
  891. #~ return $addr;
  892. #~ }
  sub _UnescapeSpecialChars {
      #
      # Swap back in all the special characters we've hidden.
      #
      # Parameter:  String that may contain placeholders taken from the
      #             values of %g_escape_table.
      # Returns:    The string with every placeholder replaced by the
      #             character (the table key) it stands for.
      #
      my $text = shift;

      # keys() always walks the whole table from the start; each() would
      # resume from wherever the hash's shared iterator was last left and
      # could silently skip entries.
      for my $char (keys %g_escape_table) {
          my $placeholder = $g_escape_table{$char};

          # \Q...\E: the placeholder is data, so match it literally.
          $text =~ s/\Q$placeholder\E/$char/g;
      }

      return $text;
  }
  sub _TokenizeHTML {
      #
      # Split a string of HTML into a flat list of tag and text tokens.
      #
      # Parameter:  String containing HTML markup.
      # Returns:    Reference to an array of tokens, in input order. Each
      #             token is a two-element array: the type ('tag' or
      #             'text') followed by the matched string. A 'tag' is a
      #             comment, a processing instruction, or a tag, which may
      #             itself contain nested tags such as <a href="<MTFoo>">.
      #             A 'text' is the run of characters between two tags.
      #
      # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
      # <http://www.bradchoate.com/past/mtregex.php>
      #
      my $str = shift;

      # The tag pattern is $depth copies of the same sub-pattern, each one
      # allowed to appear inside the previous one.
      my $depth = 6;
      my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
      my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
                     (?s: <\? .*? \?> ) | # processing instruction
                     $nested_tags/ix; # nested tags

      my @tokens;
      my $cursor = 0;    # offset just past the most recent tag

      while ($str =~ m/($match)/g) {
          my $tag       = $1;
          my $tag_start = $-[1];

          # Anything between the previous tag and this one is plain text.
          push @tokens, ['text', substr($str, $cursor, $tag_start - $cursor)]
              if $cursor < $tag_start;

          push @tokens, ['tag', $tag];
          $cursor = $+[1];
      }

      # Trailing text after the last tag (or the whole string if no tag matched).
      push @tokens, ['text', substr($str, $cursor)] if $cursor < length $str;

      return \@tokens;
  }
  sub _Outdent {
      #
      # Strip one level of indentation from the start of every line.
      #
      # Parameter:  String.
      # Returns:    The string with, at the start of each line, either one
      #             tab or a run of one to $g_tab_width spaces removed.
      #
      my $text = shift;

      # /m makes ^ match at the start of every line, not only the string.
      my $one_indent = qr/^(\t|[ ]{1,$g_tab_width})/m;
      $text =~ s/$one_indent//g;

      return $text;
  }
  sub _Detab {
      #
      # Expand tabs into spaces, padding to a multiple of $g_tab_width.
      #
      # Parameter:  String.
      # Returns:    The string with each tab replaced by enough spaces to
      #             bring the text before it up to the next multiple of
      #             $g_tab_width characters.
      #
      # Cribbed from a post by Bart Lateur:
      # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
      #
      my $text = shift;

      $text =~ s{(.*?)\t}{
          my $before  = $1;
          my $padding = $g_tab_width - length($before) % $g_tab_width;
          $before . (' ' x $padding);
      }ge;

      return $text;
  }
  956. 1;
  957. __END__
  958. =pod
  959. =head1 NAME
  960. B<Markdown>
  961. =head1 SYNOPSIS
  962. B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<--shortversion> ]
  963. [ I<file> ... ]
  964. =head1 DESCRIPTION
  965. Markdown is a text-to-HTML filter; it translates an easy-to-read /
  966. easy-to-write structured text format into HTML. Markdown's text format
  967. is most similar to that of plain text email, and supports features such
  968. as headers, *emphasis*, code blocks, blockquotes, and links.
  969. Markdown's syntax is designed not as a generic markup language, but
  970. specifically to serve as a front-end to (X)HTML. You can use span-level
  971. HTML tags anywhere in a Markdown document, and you can use block level
  972. HTML tags (like <div> and <table>) as well.
  973. For more information about Markdown's syntax, see:
  974. http://daringfireball.net/projects/markdown/
  975. =head1 OPTIONS
  976. Use "--" to end switch parsing. For example, to open a file named "-z", use:
  977. Markdown.pl -- -z
  978. =over 4
  979. =item B<--html4tags>
  980. Use HTML 4 style for empty element tags, e.g.:
  981. <br>
  982. instead of Markdown's default XHTML style tags, e.g.:
  983. <br />
  984. =item B<-v>, B<--version>
  985. Display Markdown's version number and copyright information.
  986. =item B<-s>, B<--shortversion>
  987. Display the short-form version number.
  988. =back
  989. =head1 BUGS
  990. To file bug reports or feature requests (other than topics listed in the
  991. Caveats section above) please send email to:
  992. support@daringfireball.net
  993. Please include with your report: (1) the example input; (2) the output
  994. you expected; (3) the output Markdown actually produced.
  995. =head1 VERSION HISTORY
  996. See the readme file for detailed release notes for this version.
  997. 1.0.1 - 14 Dec 2004
  998. 1.0 - 28 Aug 2004
  999. =head1 AUTHOR
  1000. John Gruber
  1001. http://daringfireball.net
  1002. PHP port and other contributions by Michel Fortin
  1003. http://michelf.com
  1004. =head1 COPYRIGHT AND LICENSE
  1005. Copyright (c) 2003-2004 John Gruber
  1006. <http://daringfireball.net/>
  1007. All rights reserved.
  1008. Redistribution and use in source and binary forms, with or without
  1009. modification, are permitted provided that the following conditions are
  1010. met:
  1011. * Redistributions of source code must retain the above copyright notice,
  1012. this list of conditions and the following disclaimer.
  1013. * Redistributions in binary form must reproduce the above copyright
  1014. notice, this list of conditions and the following disclaimer in the
  1015. documentation and/or other materials provided with the distribution.
  1016. * Neither the name "Markdown" nor the names of its contributors may
  1017. be used to endorse or promote products derived from this software
  1018. without specific prior written permission.
  1019. This software is provided by the copyright holders and contributors "as
  1020. is" and any express or implied warranties, including, but not limited
  1021. to, the implied warranties of merchantability and fitness for a
  1022. particular purpose are disclaimed. In no event shall the copyright owner
  1023. or contributors be liable for any direct, indirect, incidental, special,
  1024. exemplary, or consequential damages (including, but not limited to,
  1025. procurement of substitute goods or services; loss of use, data, or
  1026. profits; or business interruption) however caused and on any theory of
  1027. liability, whether in contract, strict liability, or tort (including
  1028. negligence or otherwise) arising in any way out of the use of this
  1029. software, even if advised of the possibility of such damage.
  1030. =cut