wr_parser_sa.php 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. <?php
  2. /**
  3. * This is the main cli entry point for MediaWiki.
  4. *
  5. * See the README and INSTALL files for basic setup instructions
  6. * and pointers to the online documentation.
  7. *
  8. * ----------
  9. *
  10. * Copyright (C) 2009 Michael Nowak
  11. * Sean Moss-Pultz <sean@openmoko.com>
  12. * Christopher Hall <hsw@openmoko.com>
  13. *
  14. * This program is free software; you can redistribute it and/or modify
  15. * it under the terms of the GNU General Public License as published by
  16. * the Free Software Foundation; either version 2 of the License, or
  17. * (at your option) any later version.
  18. *
  19. * This program is distributed in the hope that it will be useful,
  20. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  21. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  22. * GNU General Public License for more details.
  23. *
  24. * You should have received a copy of the GNU General Public License along
  25. * with this program; if not, write to the Free Software Foundation, Inc.,
  26. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  27. * http://www.gnu.org/copyleft/gpl.html
  28. **/
  ini_set( 'memory_limit', '1G' );
  ini_set( 'xdebug.max_nesting_level', 250 );

  $wgTemplatePrefix = 'Template:';

  # Initialise common code
  require ( dirname(__FILE__) .'/includes/sa/SetupStandAlone.php' );
  require ( dirname(__FILE__) .'/includes/WebStart.php' );

  wfProfileIn('wr_parser_sa.php');

  # Set parser options
  wfSetParserOptions();

  # The single command line argument names the article stream:
  # "-" selects STDIN, anything else is treated as a file path.
  if (!isset($argv[1])) {
      fwrite(STDERR, "Usage: php wr_parser_sa.php <article-file | ->\n");
      exit(1);
  }

  if ($argv[1] === "-") {
      $fp = STDIN;
  } else {
      $fp = fopen($argv[1], "r");
      if ($fp === false) {
          # Without this check the read loop below would spin on an invalid handle.
          fwrite(STDERR, "wr_parser_sa.php: cannot open '" . $argv[1] . "' for reading\n");
          exit(1);
      }
  }

  # make sure output buffering is off before we start it
  # this will ensure same effect whether or not ob is enabled already
  while (ob_get_level() > 0) {
      if (!ob_end_flush()) {
          break; # a buffer that cannot be removed must not hang the script
      }
  }

  # start output buffering
  if (ob_get_length() === false) {
      ob_start();
  }

  $body = '';
  $articleCount = 0;

  # Articles are separated by a line consisting solely of "***EOF***".
  # Whole lines are read (no length limit) so that a fragment of an over-long
  # line can never be mistaken for the separator.
  # Text after the last separator is not parsed.
  while (($line = fgets($fp)) !== false) {
      if ($line === "***EOF***\n") {
          # Pass by value: call-time pass-by-reference (&$body) is a fatal
          # error since PHP 5.4 and the callee does not modify its argument.
          echo wfParseTextAndWrapWOC($body);
          if ($articleCount++ % 1000 == 0) { # flush every 1000 articles
              ob_flush();                    # otherwise PHP runs out of
              flush();                       # memory
          }
          $body = '';
      } else {
          $body .= $line;
      }
  }
  fclose($fp);

  # Log what the user did, for book-keeping purposes.
  wfProfileOut('wr_parser_sa.php');
  wfLogProfilingData();
  exit(0);
  /**
   * Create the parser options used by this script and register them globally.
   *
   * Builds a ParserOptions object with edit-section links switched off,
   * stores it in $wgParserOptions and hands it to $wgParser->Options().
   *
   * @return ParserOptions the object that was stored in $wgParserOptions
   */
  function wfSetParserOptions() {
      global $wgParser, $wgParserOptions;

      $options = new ParserOptions(null);
      $options->setEditSection(false);

      $wgParserOptions = $options;
      $wgParser->Options($options);

      return $options;
  }
  /**
   * Run a request through ApiMain and let the matching printer emit the result.
   *
   * @param string $text   value sent as the 'text' request parameter
   * @param string $action value sent as the 'action' request parameter
   * @param string $format value sent as the 'format' request parameter; also
   *                       selects the printer via createPrinterByName()
   * @return bool always true
   */
  function wfParseText($text, $action='parse', $format='xml') {
      # Assemble the faux request (the values are bound by reference)
      $params = array( 'action' => &$action, 'text' => &$text, 'format' => &$format );
      $api = new ApiMain(new FauxRequest($params));

      # Execute the API call
      $api->execute();

      # Fetch the printer for the requested format and the result to print
      $printer = $api->createPrinterByName($format);
      $result = $api->getResult();
      if ($printer->getNeedsRawData()) {
          $result->setRawMode();
      }
      $result->cleanUpUTF8();

      # Print the result (printer profiling calls are intentionally left out)
      $printer->initPrinter(false);
      $printer->execute();
      $printer->closePrinter();

      return true;
  }
  ### Wikipedia Offline Client - Stuff ###########################################
  /**
   * Wrap a rendered article in a minimal XHTML 1.0 Transitional page.
   *
   * The title is passed through htmlspecialchars() and used for both the
   * <title> and the <h1>; article text and language links are inserted as given.
   *
   * @param string $articleTitle         plain article title
   * @param string $articleText          article body HTML
   * @param string $articleLanguageLinks HTML placed inside the lang-links <div>
   * @return string the complete page (returned by reference)
   */
  function &wfOutputWrapperWOC($articleTitle, $articleText, $articleLanguageLinks) {
      # Escape HTML special characters in the title before embedding it
      $safeTitle = htmlspecialchars($articleTitle);

      $pieces = array(
          "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n",
          "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n",
          "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n",
          " <head>\n",
          " <title>", $safeTitle, "</title>\n",
          " </head>\n",
          " <body>\n",
          " <h1>", $safeTitle, "</h1>\n",
          $articleText, "\n",
          "<div class='noprint lang-links'>", $articleLanguageLinks, "</div> \n",
          " </body>\n",
          "</html>\n",
      );

      # A variable is required here because the function returns by reference
      $page = implode('', $pieces);
      return $page;
  }
  /**
   * 'Wikipedia Offline Client'-specific parsing of one article record.
   *
   * The first line of $text is a header of the form "<id>:<title>"; everything
   * after the first newline is wiki markup. The part before the colon is stored
   * in the global $wgTemplateFileID, the markup is rendered with $wgParser, and
   * the resulting HTML is post-processed (links rewritten, empty tags removed).
   *
   * Headers without a colon are treated as a bare title (template file ID
   * becomes ''), and input without any newline is treated as a header with
   * empty markup.
   *
   * @param string $text header line followed by the article markup
   * @return array array( title, rendered HTML, language-links HTML ), returned
   *               by reference
   */
  function &wfParseTextWOC($text) {
      global $wgParser, $wgParserOptions, $wgTemplateFileID;

      # Split header line from markup; strpos() yields false when there is no
      # newline, which must not be used as an offset.
      $nlidx = strpos($text, "\n");
      if ($nlidx === false) {
          $header = trim($text);
          $articleMarkup = '';
      } else {
          $header = trim(substr($text, 0, $nlidx));
          # Cast: substr() returns false instead of '' on PHP < 8 when the
          # newline is the last character.
          $articleMarkup = (string) substr($text, $nlidx + 1);
      }

      # Split "<id>:<title>"; without a colon the whole header is the title.
      $colon = strpos($header, ":");
      if ($colon === false) {
          $wgTemplateFileID = '';
          $articleTitle = $header;
      } else {
          $wgTemplateFileID = trim(substr($header, 0, $colon));
          $articleTitle = trim((string) substr($header, $colon + 1));
      }

      $title = Title::newFromText($articleTitle);
      if (!$title) {
          # Fall back to a placeholder title so the parser still gets a Title
          $title = Title::newFromText('NULL Title');
      }

      $output = $wgParser->parse($articleMarkup, $title, $wgParserOptions, true, true, null);
      $articleText = $output->getText();

      # Make the language links
      $langLinks = "\n <ul>\n";
      foreach ($output->getLanguageLinks() as $link) {
          $encoded = htmlspecialchars($link);
          $langLinks .= ' <li><a class="lang-link" href="' . $encoded . '">' . $encoded . "</a></li>\n";
      }
      $langLinks .= " </ul>\n";

      # change the links: drop the "page does not exist" suffix, then replace
      # each anchor's attributes with an href taken from its title attribute
      $articleText = str_replace(' (page does not exist)">', '">', $articleText);
      $articleText = preg_replace('/<a\s[^>]*title="([^"]*)">/', '<a href="$1">', $articleText);

      # fix blank tags and pre as start of article
      $articleText = preg_replace('/<(dt|dd|li)>\s*<\/\1>/', '', $articleText);
      $articleText = preg_replace('/<p>\s*<br\s*\/>\s*<\/p>|(<(ul|dl|ol)>\s*)+(<\/(ul|dl|ol)>\s*)+/', '', $articleText);
      $articleText = preg_replace('/^\s*<pre>(.*?)<\/pre>/s', '<p>$1</p>', $articleText);

      # Applied twice on purpose: the second pass removes a second <br />
      # that directly followed the first one.
      $articleText = preg_replace('/<p>\s*<br\s*\/>/', '<p>', $articleText);
      $articleText = preg_replace('/<p>\s*<br\s*\/>/', '<p>', $articleText);

      # Remove a trailing "References"/"Notes" heading that has no content after it
      $articleText = preg_replace('/<a\s+name="([rR]eferences|[nN]otes)"\s+id="([rR]eferences|[nN]otes)"><\/a><h2>\s+<span\s+class="mw-headline">\s*([rR]eferences|[nN]otes)\s*<\/span><\/h2>\s*$/', '', $articleText);
      $articleText = str_replace('%25', '%', $articleText);

      # A variable is required here because the function returns by reference
      $ret = array( $articleTitle, $articleText, $langLinks );
      return $ret;
  }
  /**
   * Parse one article record and wrap the result in a complete XHTML page.
   *
   * Combines wfParseTextWOC() (parsing) with wfOutputWrapperWOC() (page wrapper).
   *
   * @param string $text header line followed by the article markup
   * @return string the finished page (returned by reference)
   */
  function &wfParseTextAndWrapWOC($text) {
      list($parsedTitle, $parsedHtml, $parsedLangLinks) = wfParseTextWOC($text);

      # A variable is required here because the function returns by reference
      $page = wfOutputWrapperWOC($parsedTitle, $parsedHtml, $parsedLangLinks);
      return $page;
  }
  /**
   * Stub: ignores $text and returns the global parser object by reference.
   *
   * @param string $text unused
   * @return mixed the current value of the global $wgParser
   */
  function &wfParseTextAndSkin($text)
  {
      global $wgParser;

      return $wgParser;
  }
  # Global helper function for 'WOC'-specific parsing
  /**
   * Read one line from a file, walking backwards from the current position.
   *
   * Before the first call on a handle, the file pointer has to be placed two
   * bytes before the end of the file, e.g. 'fseek($fileHandle, -2, SEEK_END);'.
   *
   * Each call collects bytes backwards until a "\n" or the start of the file is
   * reached and returns them in forward order, without the newline. Afterwards
   * the pointer sits on the last byte of the preceding line, or at offset 0
   * once the first line of the file has been returned; callers can test
   * ftell($handle) == 0 to stop.
   *
   * @param resource $handle seekable stream opened for reading
   * @return string the line preceding the current position ('' for an empty line)
   */
  function fgets_reverse ($handle)
  {
      $collected = array();

      while (true)
      {
          # Remember where this byte lives: after a successful one-byte read
          # ftell() is always >= 1, so the start of the file has to be detected
          # from the position *before* reading.
          $offset = ftell($handle);
          $c = fread($handle, 1);
          if ($c === false || $c === "")
          {
              break; // end of file or read error
          }

          $atStart = ($offset === 0);
          if ($atStart)
          {
              fseek($handle, 0, SEEK_SET); // stay at the start, never seek before it
          }
          else
          {
              fseek($handle, -2, SEEK_CUR); // step back over this byte to the previous one
          }

          if ($c === "\n")
          {
              break; // we are at the end of the previous line
          }

          $collected[] = $c;

          if ($atStart)
          {
              break; // the first byte of the file has been consumed
          }
      }

      // bytes were gathered back-to-front; restore forward order
      return implode("", array_reverse($collected));
  }