123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324 |
- <?php
- /**
- * StatusNet - the distributed open-source microblogging tool
- * Copyright (C) 2010, StatusNet, Inc.
- *
- * Importer class for Delicious.com backups
- *
- * PHP version 5
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * @category Bookmark
- * @package StatusNet
- * @author Evan Prodromou <evan@status.net>
- * @copyright 2010 StatusNet, Inc.
- * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
- * @link http://status.net/
- */
// Refuse to run unless the framework bootstrap has defined STATUSNET;
// this keeps the file from being executed directly from the web.
defined('STATUSNET') || exit(1);
/**
 * Importer class for Delicious bookmarks
 *
 * Queue handler that receives a (user, backup file contents) pair, parses
 * the backup as HTML and enqueues one 'dlcsbkmk' item per public bookmark
 * found in it.
 *
 * @category  Bookmark
 * @package   StatusNet
 * @author    Evan Prodromou <evan@status.net>
 * @copyright 2010 StatusNet, Inc.
 * @license   http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
 * @link      http://status.net/
 */
class DeliciousBackupImporter extends QueueHandler
{
    /**
     * Transport of the importer
     *
     * @return string transport string
     */
    public function transport()
    {
        return 'dlcsback';
    }

    /**
     * Import an in-memory bookmark list to a user's account
     *
     * Take a delicious.com backup file (same as Netscape bookmarks.html)
     * and import to StatusNet as Bookmark activities.
     *
     * The document format is terrible. It consists of a <dl> with
     * a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
     * There are sometimes <p>'s lost inside.
     *
     * Failures on individual bookmarks are logged and skipped; they never
     * abort the rest of the import.
     *
     * @param array $data pair of user, text
     *
     * @return boolean success value (always true, including for input
     *                 that could not be parsed)
     */
    public function handle($data)
    {
        list($user, $body) = $data;

        try {
            $doc = $this->importHTML($body);
        } catch (ClientException $cex) {
            // XXX: message to the user
            common_log(LOG_WARNING, $cex->getMessage());
            return true;
        }

        // If we can't parse it, it's no good
        if (empty($doc)) {
            return true;
        }

        $dls = $doc->getElementsByTagName('dl');

        if ($dls->length != 1) {
            // XXX: message to the user
            common_log(LOG_WARNING, 'Bad input file');
            return true;
        }

        // The most recent <dt> that has not been imported yet. It is held
        // back because it may still be followed by a <dd> description.
        $dt = null;

        foreach ($dls->item(0)->childNodes as $child) {
            if ($child->nodeType != XML_ELEMENT_NODE) {
                continue;
            }

            try {
                switch (strtolower($child->tagName)) {
                case 'dt':
                    // <dt> nodes contain primary information about a
                    // bookmark. Remember the new one *before* flushing the
                    // previous one, so that a failure while importing the
                    // previous bookmark cannot make us lose this one.
                    $pending = $dt;
                    $dt      = $child;
                    if ($pending !== null) {
                        // No <dd> was provided for the previous <dt>.
                        $this->importBookmark($user, $pending);
                    }
                    break;
                case 'dd':
                    // This <dd> contains a description for the bookmark in
                    // the preceding <dt> node. A <dd> without a pending
                    // <dt> is ignored.
                    $pending = $dt;
                    $dt      = null;
                    if ($pending !== null) {
                        $this->importBookmark($user, $pending, $child);
                    }
                    break;
                case 'p':
                    common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
                    break;
                default:
                    common_log(LOG_WARNING,
                               "Unexpected element $child->tagName ".
                               " found in import.");
                }
            } catch (Exception $e) {
                // The pending state was already updated above; just log
                // and carry on with the next node.
                common_log(LOG_ERR, $e->getMessage());
            }
        }

        if ($dt !== null) {
            // There was a final bookmark without a description.
            try {
                $this->importBookmark($user, $dt);
            } catch (Exception $e) {
                common_log(LOG_ERR, $e->getMessage());
            }
        }

        return true;
    }

    /**
     * Import a single bookmark
     *
     * Takes a <dt>/<dd> pair. The <dt> has a single
     * <a> in it with some non-standard attributes
     * (private, add_date, tags).
     *
     * The bookmark is not saved here; its data is enqueued on the
     * 'dlcsbkmk' transport for later processing.
     *
     * Mis-nested input such as a <dt> parsed as the child of another <dt>
     * is not handled here; importHTML() flattens such lists beforehand.
     *
     * @param User       $user User to import data as
     * @param DOMElement $dt   <dt> element
     * @param DOMElement $dd   <dd> element, or null if there is no description
     *
     * @return void
     *
     * @throws ClientException if the <dt> has no <a>, or the bookmark is
     *                         marked private
     */
    public function importBookmark($user, $dt, $dd = null)
    {
        $as = $dt->getElementsByTagName('a');

        if ($as->length == 0) {
            // TRANS: Client exception thrown when a bookmark in an import file is incorrectly formatted.
            throw new ClientException(_m("No <A> tag in a <DT>."));
        }

        $a = $as->item(0);

        // getAttribute() returns a string ('' when the attribute is absent).
        // Convert explicitly: comparing '' to 0 with != is true on PHP 8,
        // which would reject every bookmark lacking a "private" attribute.
        if (intval($a->getAttribute('private')) != 0) {
            // TRANS: Client exception thrown when a bookmark in an import file is private.
            throw new ClientException(_m('Skipping private bookmark.'));
        }

        $description = empty($dd) ? null : $dd->nodeValue;

        // Tags are separated by whitespace and/or commas. A limit of -1
        // means "no limit" (null is deprecated for this int parameter).
        $tags = preg_split('/[\s,]+/',
                           $a->getAttribute('tags'),
                           -1,
                           PREG_SPLIT_NO_EMPTY);

        $data = array(
            'profile_id'  => $user->id,
            'title'       => $a->nodeValue,
            'description' => $description,
            'url'         => $a->getAttribute('href'),
            'tags'        => $tags,
            'created'     => common_sql_date(intval($a->getAttribute('add_date')))
        );

        $qm = QueueManager::get();
        $qm->enqueue($data, 'dlcsbkmk');
    }

    /**
     * Parse some HTML
     *
     * Hides the errors that the dom parser returns, then repairs
     * mis-nested definition lists found directly under <body>.
     *
     * @param string $body Data to import
     *
     * @return DOMDocument parsed document, or null if the input is empty
     *                     or could not be parsed
     */
    public function importHTML($body)
    {
        // Nothing to parse. Checked up front because DOMDocument::loadHTML()
        // throws a ValueError on an empty string as of PHP 8.
        if ($body === null || $body === '') {
            return null;
        }

        // DOMDocument::loadHTML may raise warnings on unrecognized elements,
        // and notices on unrecognized namespaces. Have libxml collect them
        // internally instead of emitting PHP errors.
        $previous = libxml_use_internal_errors(true);

        $dom = new DOMDocument();
        $ok  = $dom->loadHTML($body);

        if (!$previous) {
            // We switched collection on, so the buffered errors are ours
            // to discard.
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);

        if (!$ok) {
            return null;
        }

        foreach ($dom->getElementsByTagName('body') as $node) {
            $this->fixListsIn($node);
        }

        return $dom;
    }

    /**
     * Repair every <dl> that is a direct child of the given node
     *
     * @param DOMNode $body node whose immediate <dl> children get fixed
     *
     * @return void
     */
    public function fixListsIn(DOMNode $body)
    {
        // Collect first, fix afterwards, so the child list is not being
        // iterated while lists are being repaired.
        $toFix = array();

        foreach ($body->childNodes as $node) {
            if ($node->nodeType == XML_ELEMENT_NODE
                && strtolower($node->nodeName) == 'dl') {
                $toFix[] = $node;
            }
        }

        foreach ($toFix as $node) {
            $this->fixList($node);
        }
    }

    /**
     * Repair the items of a single <dl>
     *
     * Runs fixListItem() on each direct <dt>/<dd> child and recurses into
     * any <dl> that is a direct child of the list.
     *
     * @param DOMNode $list <dl> node to fix
     *
     * @return void
     */
    public function fixList(DOMNode $list)
    {
        // fixListItem() inserts new siblings into this list, so gather the
        // items before touching any of them.
        $toFix = array();

        foreach ($list->childNodes as $node) {
            if ($node->nodeType != XML_ELEMENT_NODE) {
                continue;
            }

            $el = strtolower($node->nodeName);

            if ($el == 'dt' || $el == 'dd') {
                $toFix[] = $node;
            }

            if ($el == 'dl') {
                // Sublist.
                // Technically, these can only appear inside a <dd>...
                $this->fixList($node);
            }
        }

        foreach ($toFix as $node) {
            $this->fixListItem($node);
        }
    }

    /**
     * Hoist <dt>/<dd> elements wrongly nested inside a list item
     *
     * The HTML parser in libxml2 doesn't seem to properly handle
     * many cases of implied close tags, apparently because it doesn't
     * understand the nesting rules specified in the HTML DTD.
     *
     * This leads to sequences of adjacent <dt>s or <dd>s being incorrectly
     * interpreted as parent->child trees instead of siblings:
     *
     * When parsing this input: "<dt>aaa <dt>bbb"
     * should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
     * but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
     *
     * It does at least know that going from dt to dd, or dd to dt,
     * should make a break.
     *
     * Each misplaced child is moved to directly follow the item (keeping
     * document order) and is then fixed recursively.
     *
     * @param DOMNode $item <dt> or <dd> node to fix
     *
     * @return void
     */
    public function fixListItem(DOMNode $item)
    {
        $toMove = array();

        foreach ($item->childNodes as $node) {
            if ($node->nodeType != XML_ELEMENT_NODE) {
                continue;
            }

            $el = strtolower($node->nodeName);

            if ($el == 'dt' || $el == 'dd') {
                // dt & dd cannot contain each other;
                // This node was incorrectly placed; move it up a level!
                $toMove[] = $node;
            }

            if ($el == 'dl') {
                // Sublist.
                // Technically, these can only appear inside a <dd>.
                $this->fixList($node);
            }
        }

        $parent = $item->parentNode;
        // Captured once: every moved node goes in front of the item's
        // original next sibling, so the moved nodes keep their order.
        $next = $item->nextSibling;

        foreach ($toMove as $node) {
            $item->removeChild($node);
            $parent->insertBefore($node, $next);
            $this->fixListItem($node);
        }
    }
}
|