<?php
/**
 * Class for parsing CSV files
 *
 * @author Martins Pilsetnieks
 */
class SpreadsheetReader_CSV implements Iterator, Countable
{
	/**
	 * @var array Options array, pre-populated with the default values.
	 *	Delimiter => string Field separator; a falsy value requests auto-detection
	 *	Enclosure => string Field enclosure character
	 */
	private $Options = array(
		'Delimiter' => ';',
		'Enclosure' => '"'
	);

	/**
	 * @var string Encoding of the source file, determined from its byte-order mark
	 *	(stays UTF-8 when no BOM is present).
	 */
	private $Encoding = 'UTF-8';

	/**
	 * @var int Offset of the first content byte in $Handle: the BOM length for UTF-8 files,
	 *	0 when there is no BOM or when the content has been transcoded into a temporary stream.
	 */
	private $BOMLength = 0;

	/**
	 * @var resource File handle. Always yields UTF-8 (or BOM-less) bytes once the constructor has finished.
	 */
	private $Handle = false;

	/**
	 * @var string Path of the file as passed to the constructor
	 */
	private $Filepath = '';

	/**
	 * @var int Zero-based index of the current row
	 */
	private $Index = 0;

	/**
	 * @var array|false|null Current row; null when nothing has been read since the last rewind,
	 *	false when the last read hit the end of the file.
	 */
	private $CurrentRow = null;

	/**
	 * Opens the file, detects its encoding from the byte-order mark and,
	 * if requested, detects the delimiter from the first row.
	 *
	 * @param string $Filepath Path to file
	 * @param array|null $Options Options:
	 *	Enclosure => string CSV enclosure
	 *	Delimiter => string CSV delimiter; pass a falsy value to have it detected automatically
	 *
	 * @throws Exception If the file cannot be read or a UTF-16/UTF-32 file cannot be transcoded
	 */
	public function __construct($Filepath, array $Options = null)
	{
		$this -> Filepath = $Filepath;

		if (!is_readable($Filepath))
		{
			throw new Exception('SpreadsheetReader_CSV: File not readable ('.$Filepath.')');
		}

		// For safety's sake. The setting is deprecated in recent PHP versions, hence the suppression.
		@ini_set('auto_detect_line_endings', '1');

		// $Options defaults to null, which array_merge does not accept
		if ($Options)
		{
			$this -> Options = array_merge($this -> Options, $Options);
		}

		$this -> Handle = fopen($Filepath, 'r');
		if (!$this -> Handle)
		{
			throw new Exception('SpreadsheetReader_CSV: File not readable ('.$Filepath.')');
		}

		$this -> DetectEncoding();

		// fgetcsv works on single bytes, so multi-byte encodings are converted up front
		if ($this -> Encoding != 'UTF-8')
		{
			$this -> TranscodeToUtf8();
		}

		// Seeking the place right after BOM as the start of the real content
		fseek($this -> Handle, $this -> BOMLength);

		// Checking for the delimiter if it should be determined automatically
		if (!$this -> Options['Delimiter'])
		{
			$this -> Options['Delimiter'] = $this -> DetectDelimiter();
		}
	}

	/**
	 * Closes the file handle.
	 */
	public function __destruct()
	{
		if (is_resource($this -> Handle))
		{
			fclose($this -> Handle);
		}
	}

	/**
	 * Reads the first bytes of the file and sets $Encoding and $BOMLength
	 * according to the byte-order mark found there, if any.
	 */
	private function DetectEncoding()
	{
		$Head = (string)fread($this -> Handle, 4);

		// Order matters: the UTF-32LE mark (FF FE 00 00) starts with the UTF-16LE mark (FF FE),
		// so the 4-byte marks have to be tested before the 2-byte ones.
		$Signatures = array(
			"\x00\x00\xFE\xFF" => 'UTF-32BE',
			"\xFF\xFE\x00\x00" => 'UTF-32LE',
			"\xEF\xBB\xBF" => 'UTF-8',
			"\xFF\xFE" => 'UTF-16LE',
			"\xFE\xFF" => 'UTF-16BE'
		);

		foreach ($Signatures as $BOM => $Encoding)
		{
			if (strncmp($Head, $BOM, strlen($BOM)) === 0)
			{
				$this -> Encoding = $Encoding;
				$this -> BOMLength = strlen($BOM);
				return;
			}
		}
	}

	/**
	 * Replaces $Handle with a temporary stream holding the file content converted to UTF-8.
	 *
	 * Parsing UTF-16/UTF-32 bytes directly with fgetcsv splits code units in half at every
	 * delimiter, enclosure and line break, so the conversion has to happen before parsing.
	 *
	 * @throws Exception If mbstring is unavailable or the temporary stream cannot be created
	 */
	private function TranscodeToUtf8()
	{
		if (!function_exists('mb_convert_encoding'))
		{
			throw new Exception('SpreadsheetReader_CSV: The mbstring extension is required to read '.$this -> Encoding.' files ('.$this -> Filepath.')');
		}

		$Converted = fopen('php://temp', 'w+');
		if (!$Converted)
		{
			throw new Exception('SpreadsheetReader_CSV: Could not create a temporary stream ('.$this -> Filepath.')');
		}

		$UnitSize = (strpos($this -> Encoding, 'UTF-32') === 0) ? 4 : 2;
		$Carry = '';

		fseek($this -> Handle, $this -> BOMLength);
		while (!feof($this -> Handle))
		{
			$Chunk = fread($this -> Handle, 8192);
			if ($Chunk === false || $Chunk === '')
			{
				break;
			}
			$Chunk = $Carry.$Chunk;

			// Only whole code units may be converted; the remainder waits for the next chunk
			$Usable = strlen($Chunk) - (strlen($Chunk) % $UnitSize);

			// A UTF-16 high surrogate (D800-DBFF) at the very end must stay together with
			// the low surrogate that follows it in the next chunk.
			if ($UnitSize == 2 && $Usable >= 2)
			{
				$LeadByte = ord($Chunk[$this -> Encoding == 'UTF-16LE' ? $Usable - 1 : $Usable - 2]);
				if ($LeadByte >= 0xD8 && $LeadByte <= 0xDB)
				{
					$Usable -= 2;
				}
			}

			$Carry = (string)substr($Chunk, $Usable);
			if ($Usable > 0)
			{
				fwrite($Converted, mb_convert_encoding(substr($Chunk, 0, $Usable), 'UTF-8', $this -> Encoding));
			}
		}

		// Whatever is left is a truncated sequence; let mbstring deal with it
		if ($Carry !== '')
		{
			fwrite($Converted, mb_convert_encoding($Carry, 'UTF-8', $this -> Encoding));
		}

		fclose($this -> Handle);
		$this -> Handle = $Converted;
		// The converted stream has no byte-order mark
		$this -> BOMLength = 0;
	}

	/**
	 * Reads the first row with each candidate delimiter and picks the one producing
	 * the most columns. The semicolon wins ties against both others, the tab wins
	 * a tie against the comma. The file position is reset to the start of the content.
	 *
	 * @return string Detected delimiter
	 */
	private function DetectDelimiter()
	{
		// fgetcsv needs single-byte separators
		$Counts = array();
		foreach (array(';', "\t", ',') as $Candidate)
		{
			fseek($this -> Handle, $this -> BOMLength);
			$Row = fgetcsv($this -> Handle, 0, $Candidate, $this -> Options['Enclosure'], '\\');
			// fgetcsv returns false for an empty file
			$Counts[$Candidate] = is_array($Row) ? count($Row) : 0;
		}
		fseek($this -> Handle, $this -> BOMLength);

		if ($Counts["\t"] > $Counts[';'] || $Counts[','] > $Counts[';'])
		{
			return $Counts[','] > $Counts["\t"] ? ',' : "\t";
		}

		return ';';
	}

	/**
	 * Reads and parses the next row from the handle without touching the row index.
	 *
	 * @return array|false Row values or false when there is nothing more to read
	 */
	private function ReadRow()
	{
		// The escape character is passed explicitly (it equals the historical default)
		return fgetcsv($this -> Handle, 0, $this -> Options['Delimiter'], $this -> Options['Enclosure'], '\\');
	}

	/**
	 * Returns information about sheets in the file.
	 * Because CSV doesn't have any, it's just a single entry.
	 *
	 * @return array Sheet data
	 */
	public function Sheets()
	{
		return array(0 => basename($this -> Filepath));
	}

	/**
	 * Changes sheet to another. Because CSV doesn't have any sheets
	 * it just rewinds the file so the behaviour is compatible with other
	 * sheet readers. (If an invalid index is given, it doesn't do anything.)
	 *
	 * @param int $Index Sheet index
	 *
	 * @return bool True if the index was 0 and the file was rewound, false otherwise
	 */
	public function ChangeSheet($Index)
	{
		// Loose comparison on purpose so that a numeric string such as '0' is accepted
		if ($Index == 0)
		{
			$this -> rewind();
			return true;
		}

		return false;
	}

	// !Iterator interface methods
	/**
	 * Rewind the Iterator to the first element.
	 * Similar to the reset() function for arrays in PHP
	 */
	#[\ReturnTypeWillChange]
	public function rewind()
	{
		fseek($this -> Handle, $this -> BOMLength);
		$this -> CurrentRow = null;
		$this -> Index = 0;
	}

	/**
	 * Return the current element.
	 * Similar to the current() function for arrays in PHP
	 *
	 * @return mixed current element from the collection (false if there is none)
	 */
	#[\ReturnTypeWillChange]
	public function current()
	{
		// Nothing has been read since the last rewind: load the first row but keep the index at 0
		if ($this -> CurrentRow === null)
		{
			$this -> CurrentRow = $this -> ReadRow();
		}

		return $this -> CurrentRow;
	}

	/**
	 * Move forward to next element.
	 * Similar to the next() function for arrays in PHP
	 *
	 * @return array|false The row that has become current, false at the end of the file
	 */
	#[\ReturnTypeWillChange]
	public function next()
	{
		$this -> Index++;
		$this -> CurrentRow = $this -> ReadRow();

		return $this -> CurrentRow;
	}

	/**
	 * Return the identifying key of the current element.
	 * Similar to the key() function for arrays in PHP
	 *
	 * @return int Zero-based index of the current row
	 */
	#[\ReturnTypeWillChange]
	public function key()
	{
		return $this -> Index;
	}

	/**
	 * Check if there is a current element after calls to rewind() or next().
	 * Used to check if we've iterated to the end of the collection
	 *
	 * @return boolean FALSE if there's nothing more to iterate over
	 */
	#[\ReturnTypeWillChange]
	public function valid()
	{
		if ($this -> CurrentRow === null)
		{
			// Nothing has been read yet: peek one byte (without consuming it)
			// so that an empty file does not produce a bogus row.
			$Position = ftell($this -> Handle);
			$HasData = (fgetc($this -> Handle) !== false);
			fseek($this -> Handle, $Position);

			return $HasData;
		}

		return is_array($this -> CurrentRow);
	}

	// !Countable interface method
	/**
	 * Ostensibly should return the count of the contained items but this just returns the number
	 * of rows read so far. It's not really correct but at least coherent.
	 *
	 * @return int Current row index plus one
	 */
	#[\ReturnTypeWillChange]
	public function count()
	{
		return $this -> Index + 1;
	}
}
?>