deliciousbackupimporter.php 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. <?php
  2. /**
  3. * StatusNet - the distributed open-source microblogging tool
  4. * Copyright (C) 2010, StatusNet, Inc.
  5. *
  6. * Importer class for Delicious.com backups
  7. *
  8. * PHP version 5
  9. *
  10. * This program is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU Affero General Public License as published by
  12. * the Free Software Foundation, either version 3 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * This program is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU Affero General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU Affero General Public License
  21. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  22. *
  23. * @category Bookmark
  24. * @package StatusNet
  25. * @author Evan Prodromou <evan@status.net>
  26. * @copyright 2010 StatusNet, Inc.
  27. * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
  28. * @link http://status.net/
  29. */
  // Bail out unless the STATUSNET constant has been defined by the including
  // script; this keeps the file from being executed directly from the web.
  if (defined('STATUSNET') === false) {
      exit(1);
  }
  35. /**
  36. * Importer class for Delicious bookmarks
  37. *
  38. * @category Bookmark
  39. * @package StatusNet
  40. * @author Evan Prodromou <evan@status.net>
  41. * @copyright 2010 StatusNet, Inc.
  42. * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
  43. * @link http://status.net/
  44. */
  class DeliciousBackupImporter extends QueueHandler
  {
      /**
       * Transport of the importer
       *
       * @return string transport string
       */
      public function transport()
      {
          return 'dlcsback';
      }

      /**
       * Import an in-memory bookmark list to a user's account
       *
       * Take a delicious.com backup file (same as Netscape bookmarks.html)
       * and enqueue one 'dlcsbkmk' queue item per importable bookmark.
       *
       * The document format is terrible. It consists of a <dl> with
       * a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
       * There are sometimes <p>'s lost inside.
       *
       * A failure to import one bookmark (for example because it is private)
       * is logged and does not affect the bookmarks around it.
       *
       * @param array $data pair of user, text
       *
       * @return boolean always true; problems are logged instead of reported
       *                 (NOTE(review): presumably so the queue item is not
       *                 retried - confirm against QueueHandler)
       */
      public function handle($data)
      {
          list($user, $body) = $data;

          try {
              $doc = $this->importHTML($body);
          } catch (ClientException $cex) {
              // XXX: message to the user
              common_log(LOG_WARNING, $cex->getMessage());
              return true;
          }

          // If we can't parse it, it's no good
          if (empty($doc)) {
              return true;
          }

          $dls = $doc->getElementsByTagName('dl');

          if ($dls->length != 1) {
              // XXX: message to the user
              common_log(LOG_WARNING, 'Bad input file');
              return true;
          }

          // The most recent <dt> that has not been imported yet. It is held
          // back because it may still be followed by a <dd> description.
          $dt = null;

          foreach ($dls->item(0)->childNodes as $child) {
              if ($child->nodeType != XML_ELEMENT_NODE) {
                  continue;
              }

              switch (strtolower($child->tagName)) {
                  case 'dt':
                      // <dt> nodes contain primary information about a bookmark.
                      if (!empty($dt)) {
                          // The previous <dt> had no <dd>; flush it now. A failure
                          // here must not prevent the current <dt> from becoming
                          // the pending one, or the next bookmark would be lost.
                          $this->tryImportBookmark($user, $dt);
                      }
                      $dt = $child;
                      break;
                  case 'dd':
                      if (!empty($dt)) {
                          // This <dd> contains a description for the bookmark in
                          // the preceding <dt> node.
                          $this->tryImportBookmark($user, $dt, $child);
                      }
                      // A <dd> with no pending <dt> is ignored.
                      $dt = null;
                      break;
                  case 'p':
                      common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
                      break;
                  default:
                      common_log(LOG_WARNING,
                                 "Unexpected element $child->tagName ".
                                 " found in import.");
              }
          }

          if (!empty($dt)) {
              // There was a final bookmark without a description.
              $this->tryImportBookmark($user, $dt);
          }

          return true;
      }

      /**
       * Import one bookmark, logging any failure instead of propagating it
       *
       * Keeps a single bad or private bookmark from aborting the whole import
       * or disturbing the caller's loop state.
       *
       * @param User       $user User to import data as
       * @param DOMElement $dt   <dt> element
       * @param DOMElement $dd   <dd> element, or null if there is none
       *
       * @return void
       */
      protected function tryImportBookmark($user, $dt, $dd = null)
      {
          try {
              $this->importBookmark($user, $dt, $dd);
          } catch (Exception $e) {
              common_log(LOG_ERR, $e->getMessage());
          }
      }

      /**
       * Import a single bookmark
       *
       * Takes a <dt>/<dd> pair. The <dt> has a single
       * <a> in it with some non-standard attributes
       * ('private', 'add_date', 'tags').
       *
       * Nothing is saved here directly: the extracted fields are handed to
       * the 'dlcsbkmk' queue. Mis-nested <dt>/<dd> elements are expected to
       * have been flattened already by importHTML().
       *
       * @param User       $user User to import data as
       * @param DOMElement $dt   <dt> element
       * @param DOMElement $dd   <dd> element
       *
       * @return void
       *
       * @throws ClientException if the <dt> has no <a>, or the bookmark is private
       */
      public function importBookmark($user, $dt, $dd = null)
      {
          $as = $dt->getElementsByTagName('a');

          if ($as->length == 0) {
              // TRANS: Client exception thrown when a bookmark in an import file is incorrectly formatted.
              throw new ClientException(_m("No <A> tag in a <DT>."));
          }

          $a = $as->item(0);

          // getAttribute() returns '' when the attribute is missing. Convert
          // explicitly: a loose string/int comparison treats '' as non-zero on
          // PHP 8, which would flag every unmarked bookmark as private.
          if (intval($a->getAttribute('private')) !== 0) {
              // TRANS: Client exception thrown when a bookmark in an import file is private.
              throw new ClientException(_m('Skipping private bookmark.'));
          }

          $description = empty($dd) ? null : $dd->nodeValue;

          // Tags are separated by whitespace and/or commas. -1 means "no limit"
          // (passing null for the limit is deprecated on newer PHP versions).
          $tags = preg_split('/[\s,]+/', $a->getAttribute('tags'), -1, PREG_SPLIT_NO_EMPTY);

          // NOTE(review): a missing add_date becomes intval('') == 0 here;
          // confirm that common_sql_date(0) is an acceptable creation date.
          $addDate = $a->getAttribute('add_date');

          $data = array(
              'profile_id'  => $user->id,
              'title'       => $a->nodeValue,
              'description' => $description,
              'url'         => $a->getAttribute('href'),
              'tags'        => $tags,
              'created'     => common_sql_date(intval($addDate))
          );

          $qm = QueueManager::get();
          $qm->enqueue($data, 'dlcsbkmk');
      }

      /**
       * Parse some HTML
       *
       * Hides the errors that the dom parser returns, then repairs the
       * <dl> structure under each <body> (see fixListItem()).
       *
       * @param string $body Data to import
       *
       * @return DOMDocument|null parsed document, or null if the input is
       *                          empty or could not be parsed
       */
      public function importHTML($body)
      {
          // DOMDocument::loadHTML() rejects an empty source outside of libxml's
          // error handling (a warning on older PHP, a ValueError on PHP 8+),
          // so treat "nothing to parse" as an ordinary parse failure.
          if (!is_string($body) || $body === '') {
              return null;
          }

          // DOMDocument::loadHTML may raise warnings on unrecognized elements,
          // and notices on unrecognized namespaces. Collect them internally
          // rather than changing the global error_reporting level.
          $previous = libxml_use_internal_errors(true);

          $dom = new DOMDocument();
          $ok  = $dom->loadHTML($body);

          // Discard the collected parser errors and restore the caller's setting.
          libxml_clear_errors();
          libxml_use_internal_errors($previous);

          if (!$ok) {
              return null;
          }

          foreach ($dom->getElementsByTagName('body') as $node) {
              $this->fixListsIn($node);
          }

          return $dom;
      }

      /**
       * Repair every <dl> that is a direct child of the given node
       *
       * @param DOMNode $body node whose direct children are scanned
       *
       * @return void
       */
      public function fixListsIn(DOMNode $body)
      {
          // Collect first, fix afterwards, so the child list is not being
          // restructured while it is iterated.
          $toFix = array();

          foreach ($body->childNodes as $node) {
              if ($node->nodeType == XML_ELEMENT_NODE
                  && strtolower($node->nodeName) == 'dl') {
                  $toFix[] = $node;
              }
          }

          foreach ($toFix as $node) {
              $this->fixList($node);
          }
      }

      /**
       * Repair one <dl>: fix each of its <dt>/<dd> children and any nested <dl>
       *
       * @param DOMNode $list the <dl> element
       *
       * @return void
       */
      public function fixList(DOMNode $list)
      {
          // Items are collected and fixed after the scan because fixListItem()
          // inserts new siblings into this very list.
          $toFix = array();

          foreach ($list->childNodes as $node) {
              if ($node->nodeType != XML_ELEMENT_NODE) {
                  continue;
              }

              $el = strtolower($node->nodeName);

              if ($el == 'dt' || $el == 'dd') {
                  $toFix[] = $node;
              }

              if ($el == 'dl') {
                  // Sublist.
                  // Technically, these can only appear inside a <dd>...
                  $this->fixList($node);
              }
          }

          foreach ($toFix as $node) {
              $this->fixListItem($node);
          }
      }

      /**
       * Move <dt>/<dd> elements nested inside a list item up to be its siblings
       *
       * The moved elements are inserted directly after the item, in their
       * original order, and are then fixed recursively themselves. Any <dl>
       * found inside the item is repaired with fixList().
       *
       * @param DOMNode $item a <dt> or <dd> element
       *
       * @return void
       */
      public function fixListItem(DOMNode $item)
      {
          // The HTML parser in libxml2 doesn't seem to properly handle
          // many cases of implied close tags, apparently because it doesn't
          // understand the nesting rules specified in the HTML DTD.
          //
          // This leads to sequences of adjacent <dt>s or <dd>s being incorrectly
          // interpreted as parent->child trees instead of siblings:
          //
          // When parsing this input: "<dt>aaa <dt>bbb"
          // should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
          // but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
          //
          // It does at least know that going from dt to dd, or dd to dt,
          // should make a break.
          $toMove = array();

          foreach ($item->childNodes as $node) {
              if ($node->nodeType != XML_ELEMENT_NODE) {
                  continue;
              }

              $el = strtolower($node->nodeName);

              if ($el == 'dt' || $el == 'dd') {
                  // dt & dd cannot contain each other;
                  // This node was incorrectly placed; move it up a level!
                  $toMove[] = $node;
              }

              if ($el == 'dl') {
                  // Sublist.
                  // Technically, these can only appear inside a <dd>.
                  $this->fixList($node);
              }
          }

          // Captured once, before anything moves: inserting every node before
          // the item's original next sibling keeps the moved nodes in order.
          $parent = $item->parentNode;
          $next   = $item->nextSibling;

          foreach ($toMove as $node) {
              $item->removeChild($node);
              $parent->insertBefore($node, $next);
              $this->fixListItem($node);
          }
      }
  }