SpreadsheetReader_CSV.php 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. <?php
  /**
   * Class for parsing CSV files.
   *
   * Rows are read one at a time through the Iterator interface. A byte-order
   * mark at the start of the file, if present, selects the source encoding;
   * rows from files that are not UTF-8 are converted to UTF-8 when read.
   *
   * @author Martins Pilsetnieks
   */
  class SpreadsheetReader_CSV implements Iterator, Countable
  {
      /**
       * @var array Options array, pre-populated with the default values.
       */
      private $Options = array(
          'Delimiter' => ';',
          'Enclosure' => '"'
      );

      /**
       * @var string Encoding derived from the byte-order mark (UTF-8 when there is none)
       */
      private $Encoding = 'UTF-8';

      /**
       * @var int Length of the byte-order mark in bytes, 0 when the file has none
       */
      private $BOMLength = 0;

      /**
       * @var resource File handle
       */
      private $Handle = false;

      /**
       * @var string Path the reader was constructed with
       */
      private $Filepath = '';

      /**
       * @var int Zero-based index of the current row
       */
      private $Index = 0;

      /**
       * @var array|false|null Current row; null before the first read, false when a read failed
       */
      private $CurrentRow = null;

      /**
       * @param string Path to file
       * @param array Options:
       *      Enclosure => string CSV enclosure
       *      Delimiter => string CSV delimiter; an empty value requests
       *          auto-detection between semicolon, tab and comma
       *
       * @throws Exception If the file is not readable or cannot be opened
       */
      public function __construct($Filepath, array $Options = null)
      {
          $this->Filepath = $Filepath;

          if (!is_readable($Filepath))
          {
              throw new Exception('SpreadsheetReader_CSV: File not readable ('.$Filepath.')');
          }

          // For safety's sake
          @ini_set('auto_detect_line_endings', true);

          // $Options defaults to null, which array_merge() does not accept
          if ($Options)
          {
              $this->Options = array_merge($this->Options, $Options);
          }

          $this->Handle = fopen($Filepath, 'r');
          if (!$this->Handle)
          {
              throw new Exception('SpreadsheetReader_CSV: File could not be opened ('.$Filepath.')');
          }

          $this->DetectBOM();

          // Always seek to the start of the real content: probing for the BOM
          // moves the pointer even when no BOM is found.
          fseek($this->Handle, $this->BOMLength);

          // Checking for the delimiter if it should be determined automatically
          if (!$this->Options['Delimiter'])
          {
              $this->Options['Delimiter'] = $this->DetectDelimiter();
          }
      }

      /**
       * Closes the file handle if it is still open.
       */
      public function __destruct()
      {
          if (is_resource($this->Handle))
          {
              fclose($this->Handle);
          }
      }

      /**
       * Inspects the first bytes of the file for a byte-order mark and sets
       * Encoding and BOMLength accordingly. Leaves both untouched when no
       * known mark is present.
       */
      private function DetectBOM()
      {
          fseek($this->Handle, 0);
          $Head = fread($this->Handle, 4);
          $Hex = ($Head === false) ? '' : bin2hex($Head);

          // Longer signatures go first: the UTF-32LE mark (FF FE 00 00)
          // begins with the UTF-16LE mark (FF FE).
          $Signatures = array(
              '0000feff' => 'UTF-32BE',
              'fffe0000' => 'UTF-32LE',
              'efbbbf' => 'UTF-8',
              'fffe' => 'UTF-16LE',
              'feff' => 'UTF-16BE'
          );

          foreach ($Signatures as $Signature => $Encoding)
          {
              $HexLength = strlen($Signature);
              if (strncmp($Hex, $Signature, $HexLength) === 0)
              {
                  $this->Encoding = $Encoding;
                  // Two hex digits per byte
                  $this->BOMLength = (int)($HexLength / 2);
                  return;
              }
          }
      }

      /**
       * Reads the first row once per candidate delimiter and picks the one
       * that splits it into the most columns (it means that most likely
       * that is the delimiter). Semicolon wins when nothing beats it; tab
       * wins a tie with comma.
       *
       * The file pointer is left at the start of the content.
       *
       * @return string Detected delimiter
       */
      private function DetectDelimiter()
      {
          // fgetcsv needs single-byte separators
          $Semicolon = ';';
          $Tab = "\t";
          $Comma = ',';

          $Counts = array();
          foreach (array($Semicolon, $Tab, $Comma) as $Candidate)
          {
              fseek($this->Handle, $this->BOMLength);
              $Row = fgetcsv($this->Handle, null, $Candidate, '"', '\\');
              // fgetcsv returns false for an empty file; count that as no columns
              $Counts[$Candidate] = is_array($Row) ? count($Row) : 0;
          }
          fseek($this->Handle, $this->BOMLength);

          $Delimiter = $Semicolon;
          if ($Counts[$Tab] > $Counts[$Semicolon] || $Counts[$Comma] > $Counts[$Semicolon])
          {
              $Delimiter = $Counts[$Comma] > $Counts[$Tab] ? $Comma : $Tab;
          }

          return $Delimiter;
      }

      /**
       * Returns information about sheets in the file.
       * Because CSV doesn't have any, it's just a single entry.
       *
       * @return array Sheet data
       */
      public function Sheets()
      {
          return array(0 => basename($this->Filepath));
      }

      /**
       * Changes sheet to another. Because CSV doesn't have any sheets
       * it just rewinds the file so the behaviour is compatible with other
       * sheet readers. (If an invalid index is given, it doesn't do anything.)
       *
       * @param int Sheet index
       *
       * @return bool Status
       */
      public function ChangeSheet($Index)
      {
          if ($Index == 0)
          {
              $this->rewind();
              return true;
          }

          return false;
      }

      // !Iterator interface methods

      /**
       * Rewind the Iterator to the first element.
       * Similar to the reset() function for arrays in PHP
       */
      #[\ReturnTypeWillChange]
      public function rewind()
      {
          fseek($this->Handle, $this->BOMLength);
          $this->CurrentRow = null;
          $this->Index = 0;
      }

      /**
       * Return the current element.
       * Similar to the current() function for arrays in PHP
       *
       * @return mixed current element from the collection
       */
      #[\ReturnTypeWillChange]
      public function current()
      {
          // Nothing has been read yet: load the first row, then undo the
          // index increment done by next() so the first row keeps index 0.
          if ($this->Index == 0 && is_null($this->CurrentRow))
          {
              $this->next();
              $this->Index--;
          }

          return $this->CurrentRow;
      }

      /**
       * Move forward to next element.
       * Similar to the next() function for arrays in PHP
       *
       * @return array|false The row that was read, or false if reading failed
       */
      #[\ReturnTypeWillChange]
      public function next()
      {
          if ($this->Encoding == 'UTF-16LE' || $this->Encoding == 'UTF-16BE')
          {
              $this->SkipLineBreakResidue();
          }

          $this->Index++;
          $this->CurrentRow = fgetcsv(
              $this->Handle,
              null,
              $this->Options['Delimiter'],
              $this->Options['Enclosure'],
              '\\'
          );

          // Converting multi-byte unicode strings
          // and trimming enclosure symbols off of them because those aren't recognized
          // in the relevant encodings.
          if ($this->CurrentRow && $this->Encoding != 'ASCII' && $this->Encoding != 'UTF-8')
          {
              foreach ($this->CurrentRow as $Key => $Value)
              {
                  // A blank line yields a null field; cast so the converter gets a string
                  $this->CurrentRow[$Key] = trim(trim(
                      mb_convert_encoding((string)$Value, 'UTF-8', $this->Encoding),
                      $this->Options['Enclosure']
                  ));
              }
          }

          return $this->CurrentRow;
      }

      /**
       * Finds the place the next line starts for UTF-16 encoded files.
       *
       * Line breaks could be 0x0D 0x00 0x0A 0x00 and PHP could split lines on
       * the first or the second linebreak leaving unnecessary \0 characters
       * that mess up the output, so NUL, LF and CR bytes are skipped and the
       * pointer is then moved back in front of the first significant byte.
       */
      private function SkipLineBreakResidue()
      {
          while (!feof($this->Handle))
          {
              $Byte = fgetc($this->Handle);
              if ($Byte === false)
              {
                  break;
              }

              // While bytes are insignificant whitespace, do nothing
              $Code = ord($Byte);
              if (!$Code || $Code == 10 || $Code == 13)
              {
                  continue;
              }

              // When significant bytes are found, step back to the last place
              // before them: one byte for little-endian, two for big-endian.
              $StepBack = ($this->Encoding == 'UTF-16LE') ? 1 : 2;

              // Never step back into the byte-order mark
              $Target = max($this->BOMLength, ftell($this->Handle) - $StepBack);
              fseek($this->Handle, $Target);
              break;
          }
      }

      /**
       * Return the identifying key of the current element.
       * Similar to the key() function for arrays in PHP
       *
       * @return mixed either an integer or a string
       */
      #[\ReturnTypeWillChange]
      public function key()
      {
          return $this->Index;
      }

      /**
       * Check if there is a current element after calls to rewind() or next().
       * Used to check if we've iterated to the end of the collection
       *
       * @return boolean FALSE if there's nothing more to iterate over
       */
      #[\ReturnTypeWillChange]
      public function valid()
      {
          return ($this->CurrentRow || !feof($this->Handle));
      }

      // !Countable interface method

      /**
       * Ostensibly should return the count of the contained items but this just returns the number
       * of rows read so far. It's not really correct but at least coherent.
       *
       * @return int Current row index plus one
       */
      #[\ReturnTypeWillChange]
      public function count()
      {
          return $this->Index + 1;
      }
  }
  247. ?>