123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
<?php
/**
 * File name: caută-în-ziare.php
 * Search for persons in ziare.com
 *
 * (C) Copyright 2013 Friedrich-Ebert-Stiftung (http://fes.ro)
 * Author: Tiberiu C. Turbureanu (tct@ceata.org)
 *
 * This file is part of the project funded by FES
 *
 * This is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/*
 * Overview: for every <person> in ../20131013-consilieri.xml this script
 * searches ziare.com for "<comname> <surname>", walks every result page,
 * keeps the news items whose full article text mentions the person and whose
 * link is not listed in ../filtre/<comname>-<surname>.xml, and writes the
 * survivors as an RSS 2.0 feed to ../fluxuri/02-teste/<comname>-<surname>.xml.
 */

require_once('utile.php');

/**
 * Download an HTML page and return an XPath evaluator over its DOM.
 *
 * @param string $url Absolute URL of the page.
 * @return DOMXPath|null Null when the download fails, the body is empty,
 *                       or libxml cannot build a document from it.
 */
function ziare_load_html_xpath($url)
{
    $html = file_get_contents($url);
    if ($html === false || $html === '') {
        return null;
    }

    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($html);
    // Parse diagnostics are buffered (see libxml_use_internal_errors below);
    // drop them so the buffer does not grow across hundreds of pages.
    libxml_clear_errors();

    return $loaded ? new DOMXPath($doc) : null;
}

/**
 * Return the first node matched by an XPath expression.
 *
 * @param DOMXPath     $xpath      Evaluator to query.
 * @param string       $expression XPath expression.
 * @param DOMNode|null $context    Optional context node for relative expressions.
 * @return DOMNode|null The first match, or null when nothing matches.
 */
function ziare_first_node(DOMXPath $xpath, $expression, $context = null)
{
    $nodes = ($context === null)
        ? $xpath->query($expression)
        : $xpath->query($expression, $context);

    if ($nodes === false || $nodes->length === 0) {
        return null;
    }

    return $nodes->item(0);
}

/**
 * Read the blacklisted links (/xml/link values) from a filter file.
 *
 * @param string $path Path of the filter XML file.
 * @return array List of link strings; empty when the file is missing or unreadable.
 */
function ziare_load_blacklist($path)
{
    $links = array();
    if (!is_file($path)) {
        return $links;
    }

    $xml = new DOMDocument();
    $loaded = $xml->load($path);
    libxml_clear_errors();
    if (!$loaded) {
        return $links;
    }

    $xpath = new DOMXPath($xml);
    foreach ($xpath->query('/xml/link') as $node) {
        $links[] = $node->nodeValue;
    }

    return $links;
}

/**
 * Parse the publication stamp shown next to a search result.
 *
 * The text is split on ',', ':' and ' ' (empty pieces dropped); the pieces are
 * read positionally as: ignored, day, month name, year, ignored, hour, minute.
 *
 * @param string $text     Raw text of the date element.
 * @param array  $monthnum Map from month name (as it appears in the text) to month number.
 * @return DateTime|null Date in the Europe/Bucharest zone, or null when the
 *                       text does not have the expected shape.
 */
function ziare_parse_published($text, $monthnum)
{
    $tokens = preg_split('/[,: ]+/', $text, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === false || count($tokens) < 7) {
        return null;
    }

    list(, $day, $month, $year, , $hour, $min) = $tokens;

    if (!is_array($monthnum) || !isset($monthnum[$month])) {
        return null;
    }

    // Numeric pieces may carry stray whitespace (only ',', ':' and ' ' split).
    $day = trim($day);
    $year = trim($year);
    $hour = trim($hour);
    $min = trim($min);
    if (!ctype_digit($day) || !ctype_digit($year) || !ctype_digit($hour) || !ctype_digit($min)) {
        return null;
    }

    $dt = new DateTime('now', new DateTimeZone('Europe/Bucharest'));
    $dt->setDate((int) $year, (int) $monthnum[$month], (int) $day);
    $dt->setTime((int) $hour, (int) $min, 0);

    return $dt;
}

/**
 * Escape text for use as XML character data.
 *
 * @param string $text Raw UTF-8 text.
 * @return string Text with &, <, >, " and ' replaced by entities.
 */
function ziare_xml_text($text)
{
    return htmlspecialchars($text, ENT_QUOTES, 'UTF-8');
}

/**
 * Wrap text in a CDATA section.
 *
 * @param string $text Raw text.
 * @return string CDATA section; any "]]>" inside the text is split across two
 *                sections so it cannot terminate the section early.
 */
function ziare_cdata($text)
{
    return '<![CDATA['.str_replace(']]>', ']]]]><![CDATA[>', $text).']]>';
}

// Scraped HTML is rarely well-formed; collect libxml diagnostics internally
// instead of emitting a PHP warning for every markup error.
libxml_use_internal_errors(true);

// Month-name lookup used for parsing dates. It is not defined in this file;
// presumably utile.php provides $monthnum -- TODO confirm.
$months = (isset($monthnum) && is_array($monthnum)) ? $monthnum : array();

// Load the list of persons
$xmla = new DOMDocument();
if (!$xmla->load('../20131013-consilieri.xml')) {
    trigger_error('Cannot load the list of persons: ../20131013-consilieri.xml', E_USER_WARNING);
    exit(1);
}
$xpatha = new DOMXPath($xmla);

foreach ($xpatha->query('/xml/person') as $person) {
    $comnameNode = ziare_first_node($xpatha, 'comname', $person);
    $surnameNode = ziare_first_node($xpatha, 'surname', $person);
    if ($comnameNode === null || $surnameNode === null) {
        trigger_error('Skipping <person> without <comname>/<surname>', E_USER_WARNING);
        continue;
    }
    $comname = $comnameNode->nodeValue;
    $surname = $surnameNode->nodeValue;
    $slug = $comname.'-'.$surname;

    // Load the blacklist of links for this person
    $blacklist = ziare_load_blacklist('../filtre/'.$slug.'.xml');

    $rss = '<?xml version="1.0" encoding="UTF-8"?>'.PHP_EOL;
    $rss .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"
     xmlns:georss="http://www.georss.org/georss">'.PHP_EOL;
    $rss .= '<channel>'.PHP_EOL;
    $rss .= '<title>'.ziare_xml_text('Știri despre '.ucfirst($comname).' '.ucfirst($surname)).'</title>'.PHP_EOL;
    $rss .= '<link>'.ziare_xml_text('http://turbureanu.org/consilieri/'.$slug).'</link>'.PHP_EOL;
    $rss .= '<description>Consilier în cadrul Consiliului General al Municipiului București</description>'.PHP_EOL;
    $rss .= '<language>ro</language>'.PHP_EOL;
    $rss .= '<ttl>480</ttl>'.PHP_EOL;

    // Start page for search; each name part is URL-encoded so that spaces or
    // non-ASCII letters cannot produce an invalid URL.
    $start = 'http://www.ziare.com/cautare/'.urlencode($comname).'+'.urlencode($surname);

    // Get the number of result pages from the "Ultima" (last) pagination link.
    // Without such a link, or when its href has no "-pag<N>" part, assume a
    // single page rather than zero.
    $last = 1;
    $firstPage = ziare_load_html_xpath($start);
    if ($firstPage !== null) {
        $pag = ziare_first_node($firstPage, "//div[@class='paginatie_profil']/div/a[@title='Ultima']");
        if ($pag !== null && preg_match('/-pag(\d+)/', $pag->getAttribute('href'), $m)) {
            $last = max(1, (int) $m[1]);
        }
    }

    for ($i = 1; $i <= $last; $i++) {
        // Get the current page of results
        $xpath = ziare_load_html_xpath($start.'-pag'.$i);
        if ($xpath === null) {
            continue;
        }

        foreach ($xpath->query("//div[@id='tag_results']/div[@class='tag_news']") as $e) {
            // Get news link (the same anchor also carries the title)
            $anchor = ziare_first_node($xpath, "h2[@class='titlu_sec']/a", $e);
            if ($anchor === null) {
                continue;
            }
            $l = $anchor->getAttribute('href');

            // Next news if there is no link or the link is blacklisted
            if ($l === '' || in_array($l, $blacklist, true)) {
                continue;
            }

            // Get full news document
            $xpathf = ziare_load_html_xpath($l);
            if ($xpathf === null) {
                continue;
            }

            // Next news if the name is not in the news full description
            $fullNode = ziare_first_node($xpathf, "//div[@class='descriere_main']");
            if ($fullNode === null || stripos($fullNode->nodeValue, $comname.' '.$surname) === false) {
                continue;
            }

            // Get description (may be absent on the result card)
            $descNode = ziare_first_node($xpath, "p[@class='descriere_sec']", $e);
            $d = ($descNode !== null) ? $descNode->nodeValue : '';

            // Get news title
            $t = $anchor->nodeValue;

            // Get the date and time the news was published
            $dateNode = ziare_first_node($xpath, "div[@class='comms']/span", $e);
            $dt = ($dateNode !== null) ? ziare_parse_published($dateNode->nodeValue, $months) : null;

            // Progress output: echo each accepted link as it is found.
            print $l;

            $rss .= '<item>'.PHP_EOL;
            $rss .= '<title>'.ziare_cdata($t).'</title>'.PHP_EOL;
            $rss .= '<description>'.ziare_cdata($d).'</description>'.PHP_EOL;
            if ($dt !== null) {
                // RSS 2.0 requires RFC 822 dates; pubDate is optional, so it is
                // left out when the date could not be parsed.
                $rss .= '<pubDate>'.$dt->format(DATE_RSS).'</pubDate>'.PHP_EOL;
            }
            $rss .= '<link>'.ziare_xml_text($l).'</link>'.PHP_EOL;
            $rss .= '</item>'.PHP_EOL;
        }
    }

    $rss .= '</channel>'.PHP_EOL;
    $rss .= '</rss>'.PHP_EOL;

    $target = '../fluxuri/02-teste/'.$slug.'.xml';
    if (file_put_contents($target, $rss) === false) {
        trigger_error('Cannot write feed file: '.$target, E_USER_WARNING);
    }
}
?>
|