Ceata
/
pmbplus


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
							<?php

/**
 * File name: caută-în-ziare.php
 * Search for persons in ziare.com
 *
 * (C) Copyright 2013 Friedrich-Ebert-Stiftung (http://fes.ro)
 * Author: Tiberiu C. Turbureanu (tct@ceata.org)
 *
 * This file is part of the project funded by FES
 *
 * This is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

require_once('utile.php');

// Load the list of persons: every /xml/person element of the file below.
// $xpatha and $persons are consumed by the per-person loop further down.
$personsFile = '../20131013-consilieri.xml';
$xmla = new DOMDocument();
if (!$xmla->load($personsFile))
{
  // Without the list there is nothing to process; fail loudly rather than
  // finishing "successfully" with no feed generated.
  throw new RuntimeException('Cannot load the list of persons: '.$personsFile);
}
$xpatha = new DOMXpath($xmla);
$persons = $xpatha->query("/xml/person");
// Build one RSS 2.0 feed per person out of the ziare.com search results.
//
// For every person:
//   1. read the optional blacklist of links from ../filtre/<comname>-<surname>.xml
//   2. fetch the search start page and find out how many result pages exist
//   3. walk every result page; keep the news whose full text mentions
//      "<comname> <surname>" and whose link is not blacklisted
//   4. write the feed to ../fluxuri/02-teste/<comname>-<surname>.xml
//
// Relies on $persons and $xpatha (the persons list) and on $monthnum.
// NOTE(review): $monthnum is not defined in this file; presumably utile.php
// provides it as a map from the month name printed by ziare.com to the month
// number -- confirm against utile.php.

// The timezone is the same for every news item, so create it only once.
$tz = new DateTimeZone("Europe/Bucharest");

foreach ($persons as $person)
{
  $comnameNode = $xpatha->query("comname", $person)->item(0);
  $surnameNode = $xpatha->query("surname", $person)->item(0);
  // Skip an incomplete <person> entry instead of dying on a null node
  if (!$comnameNode || !$surnameNode) continue;

  $comname = $comnameNode->nodeValue;
  $surname = $surnameNode->nodeValue;

  // Load the blacklist of links for this person (the file is optional)
  $blacklist = array();
  $blacklistFile = '../filtre/'.$comname.'-'.$surname.'.xml';
  if (is_readable($blacklistFile))
  {
    $xml = new DOMDocument();
    if ($xml->load($blacklistFile))
    {
      $blxpath = new DOMXpath($xml);
      foreach ($blxpath->query("/xml/link") as $bl)
      {
        $blacklist[] = $bl->nodeValue;
      }
    }
  }

  // Start page for search
  $start = 'http://www.ziare.com/cautare/'.$comname.'+'.$surname;
  $html = file_get_contents($start);
  if ($html === false || $html === '')
  {
    // Do not overwrite a previously generated feed with an empty one just
    // because the site could not be reached this time.
    trigger_error('Cannot fetch search page: '.$start, E_USER_WARNING);
    continue;
  }
  $doc = new DOMDocument();
  $doc->loadHTML($html);
  $xpath = new DOMXpath($doc);

  // Get the number of result pages from the "Ultima" (last) pagination link,
  // whose href ends in "-pag<N>"; a single page when there is no such link.
  $last = 1;
  $pag = $xpath->query("//div[@class='paginatie_profil']/div/a[@title='Ultima']")->item(0);
  if ($pag && preg_match('/-pag(\d+)/', $pag->getAttribute('href'), $m))
  {
    $last = max(1, intval($m[1]));
  }

  // Names are escaped before being placed into XML element content
  $comnameXml = htmlspecialchars($comname, ENT_QUOTES, 'UTF-8');
  $surnameXml = htmlspecialchars($surname, ENT_QUOTES, 'UTF-8');

  $rss  = '<?xml version="1.0" encoding="UTF-8"?>'.PHP_EOL;
  $rss .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" '."\n"
        . '        xmlns:georss="http://www.georss.org/georss">'.PHP_EOL;
  $rss .= '<channel>'.PHP_EOL;
  $rss .= '<title>Știri despre '.ucfirst($comnameXml).' '.ucfirst($surnameXml).'</title>'.PHP_EOL;
  $rss .= '<link>http://turbureanu.org/consilieri/'.$comnameXml.'-'.$surnameXml.'</link>'.PHP_EOL;
  $rss .= '<description>Consilier în cadrul Consiliului General al Municipiului București</description>'.PHP_EOL;
  $rss .= '<language>ro</language>'.PHP_EOL;
  $rss .= '<ttl>480</ttl>'.PHP_EOL;

  for ($i = 1; $i <= $last; $i++)
  {
    // Get the current page of results; a page that cannot be fetched is skipped
    $html = file_get_contents($start.'-pag'.$i);
    if ($html === false || $html === '') continue;
    $doc = new DOMDocument();
    $doc->loadHTML($html);
    $xpath = new DOMXpath($doc);

    $elements = $xpath->query("//div[@id='tag_results']/div[@class='tag_news']");

    foreach ($elements as $e)
    {
      // Get news link; the same anchor also carries the news title
      $a = $xpath->query("h2[@class='titlu_sec']/a", $e)->item(0);
      if (!$a) continue;
      $l = $a->getAttribute('href');
      // Next news if this news link is blacklisted
      if (in_array($l, $blacklist, true)) continue;

      // Get full news document
      $htmlf = file_get_contents($l);
      if ($htmlf === false || $htmlf === '') continue;
      $docf = new DOMDocument();
      $docf->loadHTML($htmlf);
      $xpathf = new DOMXpath($docf);
      // Get full description
      $dfNode = $xpathf->query("//div[@class='descriere_main']")->item(0);
      // Next news if the name is not in the news full description
      if (!$dfNode || stripos($dfNode->nodeValue, $comname.' '.$surname) === false) continue;

      // Get description (empty when the summary paragraph is missing)
      $dNode = $xpath->query("p[@class='descriere_sec']", $e)->item(0);
      $d = $dNode ? $dNode->nodeValue : '';

      // Get news title
      $t = $a->nodeValue;

      // Get the date and time the news was published and split it into
      // tokens: <skipped> day month year <skipped> hour minute
      $pubDate = '';
      $pNode = $xpath->query("div[@class='comms']/span", $e)->item(0);
      if ($pNode)
      {
        strtok($pNode->nodeValue, ",: ");
        $day = strtok(",: ");
        $month = strtok(",: ");
        $year = strtok(",: ");
        strtok(",: ");
        $hour = strtok(",: ");
        $min = strtok(",: ");

        // Only emit a date when every token was present and the month is known
        if ($min !== false && isset($monthnum[$month]))
        {
          $dt = new DateTime('now', $tz);
          $dt->setDate(intval($year), intval($monthnum[$month]), intval($day));
          $dt->setTime(intval($hour), intval($min), 0);
          // RSS 2.0 requires RFC 822 dates
          $pubDate = $dt->format(DATE_RSS);
        }
      }

      print $l;

      // A literal "]]>" inside the text would end the CDATA section early,
      // so it is split across two adjacent sections.
      $rss .= '<item>'.PHP_EOL;
      $rss .= '<title><![CDATA['.str_replace(']]>', ']]]]><![CDATA[>', $t).']]></title>'.PHP_EOL;
      $rss .= '<description><![CDATA['.str_replace(']]>', ']]]]><![CDATA[>', $d).']]></description>'.PHP_EOL;
      if ($pubDate !== '')
      {
        $rss .= '<pubDate>'.$pubDate.'</pubDate>'.PHP_EOL;
      }
      // URLs may contain "&", which must be escaped in XML element content
      $rss .= '<link>'.htmlspecialchars($l, ENT_QUOTES, 'UTF-8').'</link>'.PHP_EOL;
      $rss .= '</item>'.PHP_EOL;
    }
  }

  $rss .= '</channel>'.PHP_EOL;
  $rss .= '</rss>'.PHP_EOL;

  $feedFile = '../fluxuri/02-teste/'.$comname.'-'.$surname.'.xml';
  if (file_put_contents($feedFile, $rss) === false)
  {
    trigger_error('Cannot write feed file: '.$feedFile, E_USER_WARNING);
  }
}

?>