caută-în-ziare.php 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. <?php
  2. /**
  3. * File name: caută-în-ziare.php
  4. * Search for persons in ziare.com
  5. *
  6. * (C) Copyright 2013 Friedrich-Ebert-Stiftung (http://fes.ro)
  7. * Author: Tiberiu C. Turbureanu (tct@ceata.org)
  8. *
  9. * This file is part of the project funded by FES
  10. *
  11. * This is free software; you can redistribute it and/or modify
  12. * it under the terms of the GNU Affero General Public License as published by
  13. * the Free Software Foundation; either version 3 of the License, or
  14. * (at your option) any later version.
  15. *
  16. * This program is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU Affero General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU Affero General Public License
  22. * along with this program; if not, write to the Free Software
  23. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  24. */
  require_once('utile.php');

  // Scraped pages are rarely well-formed; keep libxml's parse errors internal
  // instead of emitting one PHP warning per malformed tag.
  libxml_use_internal_errors(true);

  /**
   * Download an HTML page and return an XPath handle on the parsed document.
   *
   * @param string $url Address of the page to fetch.
   * @return DOMXPath|null XPath object for the page, or null when the download
   *                       fails, the body is empty or the HTML cannot be parsed.
   */
  function ziare_load_html_xpath($url)
  {
      $html = file_get_contents($url);
      if ($html === false || trim($html) === '') {
          return null;
      }

      $doc = new DOMDocument();
      $parsed = $doc->loadHTML($html);
      libxml_clear_errors();

      return $parsed ? new DOMXPath($doc) : null;
  }

  /**
   * Evaluate an XPath expression and return its first match.
   *
   * @param DOMXPath     $xpath      XPath object to query.
   * @param string       $expression XPath expression.
   * @param DOMNode|null $context    Optional node the expression is relative to.
   * @return DOMNode|null First matching node, or null when nothing matches.
   */
  function ziare_first_node(DOMXPath $xpath, $expression, $context = null)
  {
      $nodes = ($context === null)
          ? $xpath->query($expression)
          : $xpath->query($expression, $context);

      if ($nodes === false || $nodes->length === 0) {
          return null;
      }

      return $nodes->item(0);
  }

  /**
   * Escape a string for use as XML element text.
   *
   * @param string $text Raw text.
   * @return string Text with XML special characters replaced by entities.
   */
  function ziare_xml_text($text)
  {
      return htmlspecialchars($text, ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8');
  }

  /**
   * Wrap a string in a CDATA section.
   *
   * A literal "]]>" inside the text would terminate the section early, so it
   * is split across two adjacent CDATA sections.
   *
   * @param string $text Raw text.
   * @return string CDATA-wrapped text.
   */
  function ziare_cdata($text)
  {
      return '<![CDATA['.str_replace(']]>', ']]]]><![CDATA[>', $text).']]>';
  }

  /**
   * Parse the publication stamp shown next to a search result.
   *
   * The stamp is split on commas, colons and spaces; the tokens are read as
   * <ignored> <day> <month name> <year> <ignored> <hour> <minute>.
   *
   * @param string $stamp  Raw text of the stamp.
   * @param mixed  $months Map from month name (as printed) to month number.
   * @return DateTime|null Date and time in the Europe/Bucharest zone, or null
   *                       when the stamp does not have the expected shape.
   */
  function ziare_parse_published($stamp, $months)
  {
      $tokens = preg_split('/[,: ]+/', $stamp, -1, PREG_SPLIT_NO_EMPTY);
      if ($tokens === false || count($tokens) < 7) {
          return null;
      }
      if (!is_array($months) || !isset($months[$tokens[2]])) {
          return null;
      }

      $day = (int) $tokens[1];
      $month = (int) $months[$tokens[2]];
      $year = (int) $tokens[3];
      $hour = (int) $tokens[5];
      $minute = (int) $tokens[6];

      if (!checkdate($month, $day, $year)) {
          return null;
      }

      $when = new DateTime('now', new DateTimeZone('Europe/Bucharest'));
      $when->setDate($year, $month, $day);
      $when->setTime($hour, $minute, 0);

      return $when;
  }

  // Month-name lookup table; presumably defined by utile.php -- TODO confirm.
  $months = isset($monthnum) ? $monthnum : array();

  // Load the list of persons
  $xmla = new DOMDocument();
  if (!$xmla->load('../20131013-consilieri.xml')) {
      error_log('Cannot load the list of persons: ../20131013-consilieri.xml');
      exit(1);
  }
  $xpatha = new DOMXPath($xmla);
  $persons = $xpatha->query("/xml/person");

  foreach ($persons as $person) {
      $comnode = ziare_first_node($xpatha, "comname", $person);
      $surnode = ziare_first_node($xpatha, "surname", $person);
      if ($comnode === null || $surnode === null) {
          // Incomplete record: nothing to search for.
          continue;
      }
      $comname = $comnode->nodeValue;
      $surname = $surnode->nodeValue;

      // Load the blacklist of links for this person (optional file)
      $blacklist = array();
      $blackfile = '../filtre/'.$comname.'-'.$surname.'.xml';
      $xml = new DOMDocument();
      if (is_file($blackfile) && $xml->load($blackfile)) {
          $blackxpath = new DOMXPath($xml);
          foreach ($blackxpath->query("/xml/link") as $bl) {
              $blacklist[] = $bl->nodeValue;
          }
      }

      // Feed header
      $rss = '<?xml version="1.0" encoding="UTF-8"?>'.PHP_EOL;
      $rss .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"'
          .' xmlns:georss="http://www.georss.org/georss">'.PHP_EOL;
      $rss .= '<channel>'.PHP_EOL;
      $rss .= '<title>Știri despre '.ziare_xml_text(ucfirst($comname).' '.ucfirst($surname)).'</title>'.PHP_EOL;
      $rss .= '<link>'.ziare_xml_text('http://turbureanu.org/consilieri/'.$comname.'-'.$surname).'</link>'.PHP_EOL;
      $rss .= '<description>Consilier în cadrul Consiliului General al Municipiului București</description>'.PHP_EOL;
      $rss .= '<language>ro</language>'.PHP_EOL;
      $rss .= '<ttl>480</ttl>'.PHP_EOL;

      // Start page for search
      $start = 'http://www.ziare.com/cautare/'.$comname.'+'.$surname;

      // Get the number of result pages from the "Ultima" (last) pagination
      // link; stay at one page when the link is absent or unparsable.
      $last = 1;
      $xpath = ziare_load_html_xpath($start);
      if ($xpath !== null) {
          $pag = ziare_first_node($xpath, "//div[@class='paginatie_profil']/div/a[@title='Ultima']");
          if ($pag !== null && preg_match('/-pag(\d+)/', $pag->getAttribute('href'), $match)) {
              $last = max(1, (int) $match[1]);
          }
      }

      for ($i = 1; $i <= $last; $i++) {
          // Get the current page of results
          $xpath = ziare_load_html_xpath($start.'-pag'.$i);
          if ($xpath === null) {
              continue;
          }

          $elements = $xpath->query("//div[@id='tag_results']/div[@class='tag_news']");
          foreach ($elements as $e) {
              // Get news link
              $anchor = ziare_first_node($xpath, "h2[@class='titlu_sec']/a", $e);
              if ($anchor === null) {
                  continue;
              }
              $l = $anchor->getAttribute('href');

              // Next news if this news link is missing or blacklisted
              if ($l === '' || in_array($l, $blacklist, true)) {
                  continue;
              }

              // Get full news document
              $xpathf = ziare_load_html_xpath($l);
              if ($xpathf === null) {
                  continue;
              }

              // Next news if the name is not in the news full description
              // (byte-wise, ASCII case-insensitive comparison)
              $full = ziare_first_node($xpathf, "//div[@class='descriere_main']");
              if ($full === null || stripos($full->nodeValue, $comname.' '.$surname) === false) {
                  continue;
              }

              // Get description
              $dnode = ziare_first_node($xpath, "p[@class='descriere_sec']", $e);
              $d = ($dnode === null) ? '' : $dnode->nodeValue;

              // Get news title
              $t = $anchor->nodeValue;

              // Get the date and time the news was published
              $pnode = ziare_first_node($xpath, "div[@class='comms']/span", $e);
              $dt = ($pnode === null) ? null : ziare_parse_published($pnode->nodeValue, $months);

              // Progress output: one accepted link per line
              print $l.PHP_EOL;

              $rss .= '<item>'.PHP_EOL;
              $rss .= '<title>'.ziare_cdata($t).'</title>'.PHP_EOL;
              $rss .= '<description>'.ziare_cdata($d).'</description>'.PHP_EOL;
              if ($dt !== null) {
                  // RSS 2.0 requires RFC 822 dates; pubDate is optional, so it
                  // is left out when the stamp could not be parsed.
                  $rss .= '<pubDate>'.$dt->format(DATE_RSS).'</pubDate>'.PHP_EOL;
              }
              $rss .= '<link>'.ziare_xml_text($l).'</link>'.PHP_EOL;
              $rss .= '</item>'.PHP_EOL;
          }
      }

      $rss .= '</channel>'.PHP_EOL;
      $rss .= '</rss>'.PHP_EOL;

      // Write the feed for this person
      $target = '../fluxuri/02-teste/'.$comname.'-'.$surname.'.xml';
      if (file_put_contents($target, $rss) === false) {
          error_log('Cannot write feed file: '.$target);
      }
  }