csv.c 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975
  1. /*
  2. ** 2016-05-28
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. **
  13. ** This file contains the implementation of an SQLite virtual table for
  14. ** reading CSV files.
  15. **
  16. ** Usage:
  17. **
  18. ** .load ./csv
  19. ** CREATE VIRTUAL TABLE temp.csv USING csv(filename=FILENAME);
  20. ** SELECT * FROM csv;
  21. **
  22. ** The columns are named "c1", "c2", "c3", ... by default. Or the
  23. ** application can define its own CREATE TABLE statement using the
  24. ** schema= parameter, like this:
  25. **
  26. ** CREATE VIRTUAL TABLE temp.csv2 USING csv(
  27. ** filename = "../http.log",
  28. ** schema = "CREATE TABLE x(date,ipaddr,url,referrer,userAgent)"
  29. ** );
  30. **
  31. ** Instead of specifying a file, the text of the CSV can be loaded using
  32. ** the data= parameter.
  33. **
  34. ** If the columns=N parameter is supplied, then the CSV file is assumed to have
  35. ** N columns. If both the columns= and schema= parameters are omitted, then
  36. ** the number and names of the columns is determined by the first line of
  37. ** the CSV input.
  38. **
  39. ** Some extra debugging features (used for testing virtual tables) are available
  40. ** if this module is compiled with -DSQLITE_TEST.
  41. */
  42. #include <sqlite3ext.h>
  43. SQLITE_EXTENSION_INIT1
  44. #include <string.h>
  45. #include <stdlib.h>
  46. #include <assert.h>
  47. #include <stdarg.h>
  48. #include <ctype.h>
  49. #include <stdio.h>
  50. #ifndef SQLITE_OMIT_VIRTUALTABLE
  51. /*
  52. ** A macro to hint to the compiler that a function should not be
  53. ** inlined.
  54. */
  55. #if defined(__GNUC__)
  56. # define CSV_NOINLINE __attribute__((noinline))
  57. #elif defined(_MSC_VER) && _MSC_VER>=1310
  58. # define CSV_NOINLINE __declspec(noinline)
  59. #else
  60. # define CSV_NOINLINE
  61. #endif
  62. /* Max size of the error message in a CsvReader */
  63. #define CSV_MXERR 200
  64. /* Size of the CsvReader input buffer */
  65. #define CSV_INBUFSZ 1024
  66. /* A context object used when read a CSV file. */
  67. typedef struct CsvReader CsvReader;
  68. struct CsvReader {
  69. FILE *in; /* Read the CSV text from this input stream */
  70. char *z; /* Accumulated text for a field */
  71. int n; /* Number of bytes in z */
  72. int nAlloc; /* Space allocated for z[] */
  73. int nLine; /* Current line number */
  74. int bNotFirst; /* True if prior text has been seen */
  75. int cTerm; /* Character that terminated the most recent field */
  76. size_t iIn; /* Next unread character in the input buffer */
  77. size_t nIn; /* Number of characters in the input buffer */
  78. char *zIn; /* The input buffer */
  79. char zErr[CSV_MXERR]; /* Error message */
  80. };
  81. /* Initialize a CsvReader object */
  82. static void csv_reader_init(CsvReader *p){
  83. p->in = 0;
  84. p->z = 0;
  85. p->n = 0;
  86. p->nAlloc = 0;
  87. p->nLine = 0;
  88. p->bNotFirst = 0;
  89. p->nIn = 0;
  90. p->zIn = 0;
  91. p->zErr[0] = 0;
  92. }
  93. /* Close and reset a CsvReader object */
  94. static void csv_reader_reset(CsvReader *p){
  95. if( p->in ){
  96. fclose(p->in);
  97. sqlite3_free(p->zIn);
  98. }
  99. sqlite3_free(p->z);
  100. csv_reader_init(p);
  101. }
  102. /* Report an error on a CsvReader */
  103. static void csv_errmsg(CsvReader *p, const char *zFormat, ...){
  104. va_list ap;
  105. va_start(ap, zFormat);
  106. sqlite3_vsnprintf(CSV_MXERR, p->zErr, zFormat, ap);
  107. va_end(ap);
  108. }
  109. /* Open the file associated with a CsvReader
  110. ** Return the number of errors.
  111. */
  112. static int csv_reader_open(
  113. CsvReader *p, /* The reader to open */
  114. const char *zFilename, /* Read from this filename */
  115. const char *zData /* ... or use this data */
  116. ){
  117. if( zFilename ){
  118. p->zIn = sqlite3_malloc( CSV_INBUFSZ );
  119. if( p->zIn==0 ){
  120. csv_errmsg(p, "out of memory");
  121. return 1;
  122. }
  123. p->in = fopen(zFilename, "rb");
  124. if( p->in==0 ){
  125. sqlite3_free(p->zIn);
  126. csv_reader_reset(p);
  127. csv_errmsg(p, "cannot open '%s' for reading", zFilename);
  128. return 1;
  129. }
  130. }else{
  131. assert( p->in==0 );
  132. p->zIn = (char*)zData;
  133. p->nIn = strlen(zData);
  134. }
  135. return 0;
  136. }
  137. /* The input buffer has overflowed. Refill the input buffer, then
  138. ** return the next character
  139. */
  140. static CSV_NOINLINE int csv_getc_refill(CsvReader *p){
  141. size_t got;
  142. assert( p->iIn>=p->nIn ); /* Only called on an empty input buffer */
  143. assert( p->in!=0 ); /* Only called if reading froma file */
  144. got = fread(p->zIn, 1, CSV_INBUFSZ, p->in);
  145. if( got==0 ) return EOF;
  146. p->nIn = got;
  147. p->iIn = 1;
  148. return p->zIn[0];
  149. }
  150. /* Return the next character of input. Return EOF at end of input. */
  151. static int csv_getc(CsvReader *p){
  152. if( p->iIn >= p->nIn ){
  153. if( p->in!=0 ) return csv_getc_refill(p);
  154. return EOF;
  155. }
  156. return ((unsigned char*)p->zIn)[p->iIn++];
  157. }
  158. /* Increase the size of p->z and append character c to the end.
  159. ** Return 0 on success and non-zero if there is an OOM error */
  160. static CSV_NOINLINE int csv_resize_and_append(CsvReader *p, char c){
  161. char *zNew;
  162. int nNew = p->nAlloc*2 + 100;
  163. zNew = sqlite3_realloc64(p->z, nNew);
  164. if( zNew ){
  165. p->z = zNew;
  166. p->nAlloc = nNew;
  167. p->z[p->n++] = c;
  168. return 0;
  169. }else{
  170. csv_errmsg(p, "out of memory");
  171. return 1;
  172. }
  173. }
  174. /* Append a single character to the CsvReader.z[] array.
  175. ** Return 0 on success and non-zero if there is an OOM error */
  176. static int csv_append(CsvReader *p, char c){
  177. if( p->n>=p->nAlloc-1 ) return csv_resize_and_append(p, c);
  178. p->z[p->n++] = c;
  179. return 0;
  180. }
  181. /* Read a single field of CSV text. Compatible with rfc4180 and extended
  182. ** with the option of having a separator other than ",".
  183. **
  184. ** + Input comes from p->in.
  185. ** + Store results in p->z of length p->n. Space to hold p->z comes
  186. ** from sqlite3_malloc64().
  187. ** + Keep track of the line number in p->nLine.
  188. ** + Store the character that terminates the field in p->cTerm. Store
  189. ** EOF on end-of-file.
  190. **
  191. ** Return 0 at EOF or on OOM. On EOF, the p->cTerm character will have
  192. ** been set to EOF.
  193. */
  194. static char *csv_read_one_field(CsvReader *p){
  195. int c;
  196. p->n = 0;
  197. c = csv_getc(p);
  198. if( c==EOF ){
  199. p->cTerm = EOF;
  200. return 0;
  201. }
  202. if( c=='"' ){
  203. int pc, ppc;
  204. int startLine = p->nLine;
  205. pc = ppc = 0;
  206. while( 1 ){
  207. c = csv_getc(p);
  208. if( c<='"' || pc=='"' ){
  209. if( c=='\n' ) p->nLine++;
  210. if( c=='"' ){
  211. if( pc=='"' ){
  212. pc = 0;
  213. continue;
  214. }
  215. }
  216. if( (c==',' && pc=='"')
  217. || (c=='\n' && pc=='"')
  218. || (c=='\n' && pc=='\r' && ppc=='"')
  219. || (c==EOF && pc=='"')
  220. ){
  221. do{ p->n--; }while( p->z[p->n]!='"' );
  222. p->cTerm = (char)c;
  223. break;
  224. }
  225. if( pc=='"' && c!='\r' ){
  226. csv_errmsg(p, "line %d: unescaped %c character", p->nLine, '"');
  227. break;
  228. }
  229. if( c==EOF ){
  230. csv_errmsg(p, "line %d: unterminated %c-quoted field\n",
  231. startLine, '"');
  232. p->cTerm = (char)c;
  233. break;
  234. }
  235. }
  236. if( csv_append(p, (char)c) ) return 0;
  237. ppc = pc;
  238. pc = c;
  239. }
  240. }else{
  241. /* If this is the first field being parsed and it begins with the
  242. ** UTF-8 BOM (0xEF BB BF) then skip the BOM */
  243. if( (c&0xff)==0xef && p->bNotFirst==0 ){
  244. csv_append(p, (char)c);
  245. c = csv_getc(p);
  246. if( (c&0xff)==0xbb ){
  247. csv_append(p, (char)c);
  248. c = csv_getc(p);
  249. if( (c&0xff)==0xbf ){
  250. p->bNotFirst = 1;
  251. p->n = 0;
  252. return csv_read_one_field(p);
  253. }
  254. }
  255. }
  256. while( c>',' || (c!=EOF && c!=',' && c!='\n') ){
  257. if( csv_append(p, (char)c) ) return 0;
  258. c = csv_getc(p);
  259. }
  260. if( c=='\n' ){
  261. p->nLine++;
  262. if( p->n>0 && p->z[p->n-1]=='\r' ) p->n--;
  263. }
  264. p->cTerm = (char)c;
  265. }
  266. assert( p->z==0 || p->n<p->nAlloc );
  267. if( p->z ) p->z[p->n] = 0;
  268. p->bNotFirst = 1;
  269. return p->z;
  270. }
  271. /* Forward references to the various virtual table methods implemented
  272. ** in this file. */
  273. static int csvtabCreate(sqlite3*, void*, int, const char*const*,
  274. sqlite3_vtab**,char**);
  275. static int csvtabConnect(sqlite3*, void*, int, const char*const*,
  276. sqlite3_vtab**,char**);
  277. static int csvtabBestIndex(sqlite3_vtab*,sqlite3_index_info*);
  278. static int csvtabDisconnect(sqlite3_vtab*);
  279. static int csvtabOpen(sqlite3_vtab*, sqlite3_vtab_cursor**);
  280. static int csvtabClose(sqlite3_vtab_cursor*);
  281. static int csvtabFilter(sqlite3_vtab_cursor*, int idxNum, const char *idxStr,
  282. int argc, sqlite3_value **argv);
  283. static int csvtabNext(sqlite3_vtab_cursor*);
  284. static int csvtabEof(sqlite3_vtab_cursor*);
  285. static int csvtabColumn(sqlite3_vtab_cursor*,sqlite3_context*,int);
  286. static int csvtabRowid(sqlite3_vtab_cursor*,sqlite3_int64*);
  287. /* An instance of the CSV virtual table */
  288. typedef struct CsvTable {
  289. sqlite3_vtab base; /* Base class. Must be first */
  290. char *zFilename; /* Name of the CSV file */
  291. char *zData; /* Raw CSV data in lieu of zFilename */
  292. long iStart; /* Offset to start of data in zFilename */
  293. int nCol; /* Number of columns in the CSV file */
  294. unsigned int tstFlags; /* Bit values used for testing */
  295. } CsvTable;
  296. /* Allowed values for tstFlags */
  297. #define CSVTEST_FIDX 0x0001 /* Pretend that constrained searchs cost less*/
  298. /* A cursor for the CSV virtual table */
  299. typedef struct CsvCursor {
  300. sqlite3_vtab_cursor base; /* Base class. Must be first */
  301. CsvReader rdr; /* The CsvReader object */
  302. char **azVal; /* Value of the current row */
  303. int *aLen; /* Length of each entry */
  304. sqlite3_int64 iRowid; /* The current rowid. Negative for EOF */
  305. } CsvCursor;
  306. /* Transfer error message text from a reader into a CsvTable */
  307. static void csv_xfer_error(CsvTable *pTab, CsvReader *pRdr){
  308. sqlite3_free(pTab->base.zErrMsg);
  309. pTab->base.zErrMsg = sqlite3_mprintf("%s", pRdr->zErr);
  310. }
  311. /*
  312. ** This method is the destructor fo a CsvTable object.
  313. */
  314. static int csvtabDisconnect(sqlite3_vtab *pVtab){
  315. CsvTable *p = (CsvTable*)pVtab;
  316. sqlite3_free(p->zFilename);
  317. sqlite3_free(p->zData);
  318. sqlite3_free(p);
  319. return SQLITE_OK;
  320. }
  321. /* Skip leading whitespace. Return a pointer to the first non-whitespace
  322. ** character, or to the zero terminator if the string has only whitespace */
  323. static const char *csv_skip_whitespace(const char *z){
  324. while( isspace((unsigned char)z[0]) ) z++;
  325. return z;
  326. }
  327. /* Remove trailing whitespace from the end of string z[] */
  328. static void csv_trim_whitespace(char *z){
  329. size_t n = strlen(z);
  330. while( n>0 && isspace((unsigned char)z[n]) ) n--;
  331. z[n] = 0;
  332. }
  333. /* Dequote the string */
  334. static void csv_dequote(char *z){
  335. int j;
  336. char cQuote = z[0];
  337. size_t i, n;
  338. if( cQuote!='\'' && cQuote!='"' ) return;
  339. n = strlen(z);
  340. if( n<2 || z[n-1]!=z[0] ) return;
  341. for(i=1, j=0; i<n-1; i++){
  342. if( z[i]==cQuote && z[i+1]==cQuote ) i++;
  343. z[j++] = z[i];
  344. }
  345. z[j] = 0;
  346. }
  347. /* Check to see if the string is of the form: "TAG = VALUE" with optional
  348. ** whitespace before and around tokens. If it is, return a pointer to the
  349. ** first character of VALUE. If it is not, return NULL.
  350. */
  351. static const char *csv_parameter(const char *zTag, int nTag, const char *z){
  352. z = csv_skip_whitespace(z);
  353. if( strncmp(zTag, z, nTag)!=0 ) return 0;
  354. z = csv_skip_whitespace(z+nTag);
  355. if( z[0]!='=' ) return 0;
  356. return csv_skip_whitespace(z+1);
  357. }
  358. /* Decode a parameter that requires a dequoted string.
  359. **
  360. ** Return 1 if the parameter is seen, or 0 if not. 1 is returned
  361. ** even if there is an error. If an error occurs, then an error message
  362. ** is left in p->zErr. If there are no errors, p->zErr[0]==0.
  363. */
  364. static int csv_string_parameter(
  365. CsvReader *p, /* Leave the error message here, if there is one */
  366. const char *zParam, /* Parameter we are checking for */
  367. const char *zArg, /* Raw text of the virtual table argment */
  368. char **pzVal /* Write the dequoted string value here */
  369. ){
  370. const char *zValue;
  371. zValue = csv_parameter(zParam,(int)strlen(zParam),zArg);
  372. if( zValue==0 ) return 0;
  373. p->zErr[0] = 0;
  374. if( *pzVal ){
  375. csv_errmsg(p, "more than one '%s' parameter", zParam);
  376. return 1;
  377. }
  378. *pzVal = sqlite3_mprintf("%s", zValue);
  379. if( *pzVal==0 ){
  380. csv_errmsg(p, "out of memory");
  381. return 1;
  382. }
  383. csv_trim_whitespace(*pzVal);
  384. csv_dequote(*pzVal);
  385. return 1;
  386. }
  387. /* Return 0 if the argument is false and 1 if it is true. Return -1 if
  388. ** we cannot really tell.
  389. */
  390. static int csv_boolean(const char *z){
  391. if( sqlite3_stricmp("yes",z)==0
  392. || sqlite3_stricmp("on",z)==0
  393. || sqlite3_stricmp("true",z)==0
  394. || (z[0]=='1' && z[1]==0)
  395. ){
  396. return 1;
  397. }
  398. if( sqlite3_stricmp("no",z)==0
  399. || sqlite3_stricmp("off",z)==0
  400. || sqlite3_stricmp("false",z)==0
  401. || (z[0]=='0' && z[1]==0)
  402. ){
  403. return 0;
  404. }
  405. return -1;
  406. }
  407. /* Check to see if the string is of the form: "TAG = BOOLEAN" or just "TAG".
  408. ** If it is, set *pValue to be the value of the boolean ("true" if there is
  409. ** not "= BOOLEAN" component) and return non-zero. If the input string
  410. ** does not begin with TAG, return zero.
  411. */
  412. static int csv_boolean_parameter(
  413. const char *zTag, /* Tag we are looking for */
  414. int nTag, /* Size of the tag in bytes */
  415. const char *z, /* Input parameter */
  416. int *pValue /* Write boolean value here */
  417. ){
  418. int b;
  419. z = csv_skip_whitespace(z);
  420. if( strncmp(zTag, z, nTag)!=0 ) return 0;
  421. z = csv_skip_whitespace(z + nTag);
  422. if( z[0]==0 ){
  423. *pValue = 1;
  424. return 1;
  425. }
  426. if( z[0]!='=' ) return 0;
  427. z = csv_skip_whitespace(z+1);
  428. b = csv_boolean(z);
  429. if( b>=0 ){
  430. *pValue = b;
  431. return 1;
  432. }
  433. return 0;
  434. }
  435. /*
  436. ** Parameters:
  437. ** filename=FILENAME Name of file containing CSV content
  438. ** data=TEXT Direct CSV content.
  439. ** schema=SCHEMA Alternative CSV schema.
  440. ** header=YES|NO First row of CSV defines the names of
  441. ** columns if "yes". Default "no".
  442. ** columns=N Assume the CSV file contains N columns.
  443. **
  444. ** Only available if compiled with SQLITE_TEST:
  445. **
  446. ** testflags=N Bitmask of test flags. Optional
  447. **
  448. ** If schema= is omitted, then the columns are named "c0", "c1", "c2",
  449. ** and so forth. If columns=N is omitted, then the file is opened and
  450. ** the number of columns in the first row is counted to determine the
  451. ** column count. If header=YES, then the first row is skipped.
  452. */
  453. static int csvtabConnect(
  454. sqlite3 *db,
  455. void *pAux,
  456. int argc, const char *const*argv,
  457. sqlite3_vtab **ppVtab,
  458. char **pzErr
  459. ){
  460. CsvTable *pNew = 0; /* The CsvTable object to construct */
  461. int bHeader = -1; /* header= flags. -1 means not seen yet */
  462. int rc = SQLITE_OK; /* Result code from this routine */
  463. int i, j; /* Loop counters */
  464. #ifdef SQLITE_TEST
  465. int tstFlags = 0; /* Value for testflags=N parameter */
  466. #endif
  467. int b; /* Value of a boolean parameter */
  468. int nCol = -99; /* Value of the columns= parameter */
  469. CsvReader sRdr; /* A CSV file reader used to store an error
  470. ** message and/or to count the number of columns */
  471. static const char *azParam[] = {
  472. "filename", "data", "schema",
  473. };
  474. char *azPValue[3]; /* Parameter values */
  475. # define CSV_FILENAME (azPValue[0])
  476. # define CSV_DATA (azPValue[1])
  477. # define CSV_SCHEMA (azPValue[2])
  478. assert( sizeof(azPValue)==sizeof(azParam) );
  479. memset(&sRdr, 0, sizeof(sRdr));
  480. memset(azPValue, 0, sizeof(azPValue));
  481. for(i=3; i<argc; i++){
  482. const char *z = argv[i];
  483. const char *zValue;
  484. for(j=0; j<sizeof(azParam)/sizeof(azParam[0]); j++){
  485. if( csv_string_parameter(&sRdr, azParam[j], z, &azPValue[j]) ) break;
  486. }
  487. if( j<sizeof(azParam)/sizeof(azParam[0]) ){
  488. if( sRdr.zErr[0] ) goto csvtab_connect_error;
  489. }else
  490. if( csv_boolean_parameter("header",6,z,&b) ){
  491. if( bHeader>=0 ){
  492. csv_errmsg(&sRdr, "more than one 'header' parameter");
  493. goto csvtab_connect_error;
  494. }
  495. bHeader = b;
  496. }else
  497. #ifdef SQLITE_TEST
  498. if( (zValue = csv_parameter("testflags",9,z))!=0 ){
  499. tstFlags = (unsigned int)atoi(zValue);
  500. }else
  501. #endif
  502. if( (zValue = csv_parameter("columns",7,z))!=0 ){
  503. if( nCol>0 ){
  504. csv_errmsg(&sRdr, "more than one 'columns' parameter");
  505. goto csvtab_connect_error;
  506. }
  507. nCol = atoi(zValue);
  508. if( nCol<=0 ){
  509. csv_errmsg(&sRdr, "column= value must be positive");
  510. goto csvtab_connect_error;
  511. }
  512. }else
  513. {
  514. csv_errmsg(&sRdr, "bad parameter: '%s'", z);
  515. goto csvtab_connect_error;
  516. }
  517. }
  518. if( (CSV_FILENAME==0)==(CSV_DATA==0) ){
  519. csv_errmsg(&sRdr, "must specify either filename= or data= but not both");
  520. goto csvtab_connect_error;
  521. }
  522. if( (nCol<=0 || bHeader==1)
  523. && csv_reader_open(&sRdr, CSV_FILENAME, CSV_DATA)
  524. ){
  525. goto csvtab_connect_error;
  526. }
  527. pNew = sqlite3_malloc( sizeof(*pNew) );
  528. *ppVtab = (sqlite3_vtab*)pNew;
  529. if( pNew==0 ) goto csvtab_connect_oom;
  530. memset(pNew, 0, sizeof(*pNew));
  531. if( CSV_SCHEMA==0 ){
  532. sqlite3_str *pStr = sqlite3_str_new(0);
  533. char *zSep = "";
  534. int iCol = 0;
  535. sqlite3_str_appendf(pStr, "CREATE TABLE x(");
  536. if( nCol<0 && bHeader<1 ){
  537. nCol = 0;
  538. do{
  539. csv_read_one_field(&sRdr);
  540. nCol++;
  541. }while( sRdr.cTerm==',' );
  542. }
  543. if( nCol>0 && bHeader<1 ){
  544. for(iCol=0; iCol<nCol; iCol++){
  545. sqlite3_str_appendf(pStr, "%sc%d TEXT", zSep, iCol);
  546. zSep = ",";
  547. }
  548. }else{
  549. do{
  550. char *z = csv_read_one_field(&sRdr);
  551. if( (nCol>0 && iCol<nCol) || (nCol<0 && bHeader) ){
  552. sqlite3_str_appendf(pStr,"%s\"%w\" TEXT", zSep, z);
  553. zSep = ",";
  554. iCol++;
  555. }
  556. }while( sRdr.cTerm==',' );
  557. if( nCol<0 ){
  558. nCol = iCol;
  559. }else{
  560. while( iCol<nCol ){
  561. sqlite3_str_appendf(pStr,"%sc%d TEXT", zSep, ++iCol);
  562. zSep = ",";
  563. }
  564. }
  565. }
  566. pNew->nCol = nCol;
  567. sqlite3_str_appendf(pStr, ")");
  568. CSV_SCHEMA = sqlite3_str_finish(pStr);
  569. if( CSV_SCHEMA==0 ) goto csvtab_connect_oom;
  570. }else if( nCol<0 ){
  571. do{
  572. csv_read_one_field(&sRdr);
  573. pNew->nCol++;
  574. }while( sRdr.cTerm==',' );
  575. }else{
  576. pNew->nCol = nCol;
  577. }
  578. pNew->zFilename = CSV_FILENAME; CSV_FILENAME = 0;
  579. pNew->zData = CSV_DATA; CSV_DATA = 0;
  580. #ifdef SQLITE_TEST
  581. pNew->tstFlags = tstFlags;
  582. #endif
  583. if( bHeader!=1 ){
  584. pNew->iStart = 0;
  585. }else if( pNew->zData ){
  586. pNew->iStart = (int)sRdr.iIn;
  587. }else{
  588. pNew->iStart = (int)(ftell(sRdr.in) - sRdr.nIn + sRdr.iIn);
  589. }
  590. csv_reader_reset(&sRdr);
  591. rc = sqlite3_declare_vtab(db, CSV_SCHEMA);
  592. if( rc ){
  593. csv_errmsg(&sRdr, "bad schema: '%s' - %s", CSV_SCHEMA, sqlite3_errmsg(db));
  594. goto csvtab_connect_error;
  595. }
  596. for(i=0; i<sizeof(azPValue)/sizeof(azPValue[0]); i++){
  597. sqlite3_free(azPValue[i]);
  598. }
  599. /* Rationale for DIRECTONLY:
  600. ** An attacker who controls a database schema could use this vtab
  601. ** to exfiltrate sensitive data from other files in the filesystem.
  602. ** And, recommended practice is to put all CSV virtual tables in the
  603. ** TEMP namespace, so they should still be usable from within TEMP
  604. ** views, so there shouldn't be a serious loss of functionality by
  605. ** prohibiting the use of this vtab from persistent triggers and views.
  606. */
  607. sqlite3_vtab_config(db, SQLITE_VTAB_DIRECTONLY);
  608. return SQLITE_OK;
  609. csvtab_connect_oom:
  610. rc = SQLITE_NOMEM;
  611. csv_errmsg(&sRdr, "out of memory");
  612. csvtab_connect_error:
  613. if( pNew ) csvtabDisconnect(&pNew->base);
  614. for(i=0; i<sizeof(azPValue)/sizeof(azPValue[0]); i++){
  615. sqlite3_free(azPValue[i]);
  616. }
  617. if( sRdr.zErr[0] ){
  618. sqlite3_free(*pzErr);
  619. *pzErr = sqlite3_mprintf("%s", sRdr.zErr);
  620. }
  621. csv_reader_reset(&sRdr);
  622. if( rc==SQLITE_OK ) rc = SQLITE_ERROR;
  623. return rc;
  624. }
  625. /*
  626. ** Reset the current row content held by a CsvCursor.
  627. */
  628. static void csvtabCursorRowReset(CsvCursor *pCur){
  629. CsvTable *pTab = (CsvTable*)pCur->base.pVtab;
  630. int i;
  631. for(i=0; i<pTab->nCol; i++){
  632. sqlite3_free(pCur->azVal[i]);
  633. pCur->azVal[i] = 0;
  634. pCur->aLen[i] = 0;
  635. }
  636. }
  637. /*
  638. ** The xConnect and xCreate methods do the same thing, but they must be
  639. ** different so that the virtual table is not an eponymous virtual table.
  640. */
  641. static int csvtabCreate(
  642. sqlite3 *db,
  643. void *pAux,
  644. int argc, const char *const*argv,
  645. sqlite3_vtab **ppVtab,
  646. char **pzErr
  647. ){
  648. return csvtabConnect(db, pAux, argc, argv, ppVtab, pzErr);
  649. }
  650. /*
  651. ** Destructor for a CsvCursor.
  652. */
  653. static int csvtabClose(sqlite3_vtab_cursor *cur){
  654. CsvCursor *pCur = (CsvCursor*)cur;
  655. csvtabCursorRowReset(pCur);
  656. csv_reader_reset(&pCur->rdr);
  657. sqlite3_free(cur);
  658. return SQLITE_OK;
  659. }
  660. /*
  661. ** Constructor for a new CsvTable cursor object.
  662. */
  663. static int csvtabOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor){
  664. CsvTable *pTab = (CsvTable*)p;
  665. CsvCursor *pCur;
  666. size_t nByte;
  667. nByte = sizeof(*pCur) + (sizeof(char*)+sizeof(int))*pTab->nCol;
  668. pCur = sqlite3_malloc64( nByte );
  669. if( pCur==0 ) return SQLITE_NOMEM;
  670. memset(pCur, 0, nByte);
  671. pCur->azVal = (char**)&pCur[1];
  672. pCur->aLen = (int*)&pCur->azVal[pTab->nCol];
  673. *ppCursor = &pCur->base;
  674. if( csv_reader_open(&pCur->rdr, pTab->zFilename, pTab->zData) ){
  675. csv_xfer_error(pTab, &pCur->rdr);
  676. return SQLITE_ERROR;
  677. }
  678. return SQLITE_OK;
  679. }
  680. /*
  681. ** Advance a CsvCursor to its next row of input.
  682. ** Set the EOF marker if we reach the end of input.
  683. */
  684. static int csvtabNext(sqlite3_vtab_cursor *cur){
  685. CsvCursor *pCur = (CsvCursor*)cur;
  686. CsvTable *pTab = (CsvTable*)cur->pVtab;
  687. int i = 0;
  688. char *z;
  689. do{
  690. z = csv_read_one_field(&pCur->rdr);
  691. if( z==0 ){
  692. break;
  693. }
  694. if( i<pTab->nCol ){
  695. if( pCur->aLen[i] < pCur->rdr.n+1 ){
  696. char *zNew = sqlite3_realloc64(pCur->azVal[i], pCur->rdr.n+1);
  697. if( zNew==0 ){
  698. csv_errmsg(&pCur->rdr, "out of memory");
  699. csv_xfer_error(pTab, &pCur->rdr);
  700. break;
  701. }
  702. pCur->azVal[i] = zNew;
  703. pCur->aLen[i] = pCur->rdr.n+1;
  704. }
  705. memcpy(pCur->azVal[i], z, pCur->rdr.n+1);
  706. i++;
  707. }
  708. }while( pCur->rdr.cTerm==',' );
  709. if( z==0 && i==0 ){
  710. pCur->iRowid = -1;
  711. }else{
  712. pCur->iRowid++;
  713. while( i<pTab->nCol ){
  714. sqlite3_free(pCur->azVal[i]);
  715. pCur->azVal[i] = 0;
  716. pCur->aLen[i] = 0;
  717. i++;
  718. }
  719. }
  720. return SQLITE_OK;
  721. }
  722. /*
  723. ** Return values of columns for the row at which the CsvCursor
  724. ** is currently pointing.
  725. */
  726. static int csvtabColumn(
  727. sqlite3_vtab_cursor *cur, /* The cursor */
  728. sqlite3_context *ctx, /* First argument to sqlite3_result_...() */
  729. int i /* Which column to return */
  730. ){
  731. CsvCursor *pCur = (CsvCursor*)cur;
  732. CsvTable *pTab = (CsvTable*)cur->pVtab;
  733. if( i>=0 && i<pTab->nCol && pCur->azVal[i]!=0 ){
  734. sqlite3_result_text(ctx, pCur->azVal[i], -1, SQLITE_TRANSIENT);
  735. }
  736. return SQLITE_OK;
  737. }
  738. /*
  739. ** Return the rowid for the current row.
  740. */
  741. static int csvtabRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
  742. CsvCursor *pCur = (CsvCursor*)cur;
  743. *pRowid = pCur->iRowid;
  744. return SQLITE_OK;
  745. }
  746. /*
  747. ** Return TRUE if the cursor has been moved off of the last
  748. ** row of output.
  749. */
  750. static int csvtabEof(sqlite3_vtab_cursor *cur){
  751. CsvCursor *pCur = (CsvCursor*)cur;
  752. return pCur->iRowid<0;
  753. }
  754. /*
  755. ** Only a full table scan is supported. So xFilter simply rewinds to
  756. ** the beginning.
  757. */
  758. static int csvtabFilter(
  759. sqlite3_vtab_cursor *pVtabCursor,
  760. int idxNum, const char *idxStr,
  761. int argc, sqlite3_value **argv
  762. ){
  763. CsvCursor *pCur = (CsvCursor*)pVtabCursor;
  764. CsvTable *pTab = (CsvTable*)pVtabCursor->pVtab;
  765. pCur->iRowid = 0;
  766. /* Ensure the field buffer is always allocated. Otherwise, if the
  767. ** first field is zero bytes in size, this may be mistaken for an OOM
  768. ** error in csvtabNext(). */
  769. if( csv_append(&pCur->rdr, 0) ) return SQLITE_NOMEM;
  770. if( pCur->rdr.in==0 ){
  771. assert( pCur->rdr.zIn==pTab->zData );
  772. assert( pTab->iStart>=0 );
  773. assert( (size_t)pTab->iStart<=pCur->rdr.nIn );
  774. pCur->rdr.iIn = pTab->iStart;
  775. }else{
  776. fseek(pCur->rdr.in, pTab->iStart, SEEK_SET);
  777. pCur->rdr.iIn = 0;
  778. pCur->rdr.nIn = 0;
  779. }
  780. return csvtabNext(pVtabCursor);
  781. }
  782. /*
  783. ** Only a forward full table scan is supported. xBestIndex is mostly
  784. ** a no-op. If CSVTEST_FIDX is set, then the presence of equality
  785. ** constraints lowers the estimated cost, which is fiction, but is useful
  786. ** for testing certain kinds of virtual table behavior.
  787. */
  788. static int csvtabBestIndex(
  789. sqlite3_vtab *tab,
  790. sqlite3_index_info *pIdxInfo
  791. ){
  792. pIdxInfo->estimatedCost = 1000000;
  793. #ifdef SQLITE_TEST
  794. if( (((CsvTable*)tab)->tstFlags & CSVTEST_FIDX)!=0 ){
  795. /* The usual (and sensible) case is to always do a full table scan.
  796. ** The code in this branch only runs when testflags=1. This code
  797. ** generates an artifical and unrealistic plan which is useful
  798. ** for testing virtual table logic but is not helpful to real applications.
  799. **
  800. ** Any ==, LIKE, or GLOB constraint is marked as usable by the virtual
  801. ** table (even though it is not) and the cost of running the virtual table
  802. ** is reduced from 1 million to just 10. The constraints are *not* marked
  803. ** as omittable, however, so the query planner should still generate a
  804. ** plan that gives a correct answer, even if they plan is not optimal.
  805. */
  806. int i;
  807. int nConst = 0;
  808. for(i=0; i<pIdxInfo->nConstraint; i++){
  809. unsigned char op;
  810. if( pIdxInfo->aConstraint[i].usable==0 ) continue;
  811. op = pIdxInfo->aConstraint[i].op;
  812. if( op==SQLITE_INDEX_CONSTRAINT_EQ
  813. || op==SQLITE_INDEX_CONSTRAINT_LIKE
  814. || op==SQLITE_INDEX_CONSTRAINT_GLOB
  815. ){
  816. pIdxInfo->estimatedCost = 10;
  817. pIdxInfo->aConstraintUsage[nConst].argvIndex = nConst+1;
  818. nConst++;
  819. }
  820. }
  821. }
  822. #endif
  823. return SQLITE_OK;
  824. }
  825. static sqlite3_module CsvModule = {
  826. 0, /* iVersion */
  827. csvtabCreate, /* xCreate */
  828. csvtabConnect, /* xConnect */
  829. csvtabBestIndex, /* xBestIndex */
  830. csvtabDisconnect, /* xDisconnect */
  831. csvtabDisconnect, /* xDestroy */
  832. csvtabOpen, /* xOpen - open a cursor */
  833. csvtabClose, /* xClose - close a cursor */
  834. csvtabFilter, /* xFilter - configure scan constraints */
  835. csvtabNext, /* xNext - advance a cursor */
  836. csvtabEof, /* xEof - check for end of scan */
  837. csvtabColumn, /* xColumn - read data */
  838. csvtabRowid, /* xRowid - read data */
  839. 0, /* xUpdate */
  840. 0, /* xBegin */
  841. 0, /* xSync */
  842. 0, /* xCommit */
  843. 0, /* xRollback */
  844. 0, /* xFindMethod */
  845. 0, /* xRename */
  846. 0, /* xSavepoint */
  847. 0, /* xRelease */
  848. 0, /* xRollbackTo */
  849. 0, /* xShadowName */
  850. 0 /* xIntegrity */
  851. };
  852. #ifdef SQLITE_TEST
  853. /*
  854. ** For virtual table testing, make a version of the CSV virtual table
  855. ** available that has an xUpdate function. But the xUpdate always returns
  856. ** SQLITE_READONLY since the CSV file is not really writable.
  857. */
  858. static int csvtabUpdate(sqlite3_vtab *p,int n,sqlite3_value**v,sqlite3_int64*x){
  859. return SQLITE_READONLY;
  860. }
  861. static sqlite3_module CsvModuleFauxWrite = {
  862. 0, /* iVersion */
  863. csvtabCreate, /* xCreate */
  864. csvtabConnect, /* xConnect */
  865. csvtabBestIndex, /* xBestIndex */
  866. csvtabDisconnect, /* xDisconnect */
  867. csvtabDisconnect, /* xDestroy */
  868. csvtabOpen, /* xOpen - open a cursor */
  869. csvtabClose, /* xClose - close a cursor */
  870. csvtabFilter, /* xFilter - configure scan constraints */
  871. csvtabNext, /* xNext - advance a cursor */
  872. csvtabEof, /* xEof - check for end of scan */
  873. csvtabColumn, /* xColumn - read data */
  874. csvtabRowid, /* xRowid - read data */
  875. csvtabUpdate, /* xUpdate */
  876. 0, /* xBegin */
  877. 0, /* xSync */
  878. 0, /* xCommit */
  879. 0, /* xRollback */
  880. 0, /* xFindMethod */
  881. 0, /* xRename */
  882. 0, /* xSavepoint */
  883. 0, /* xRelease */
  884. 0, /* xRollbackTo */
  885. 0, /* xShadowName */
  886. 0 /* xIntegrity */
  887. };
  888. #endif /* SQLITE_TEST */
  889. #endif /* !defined(SQLITE_OMIT_VIRTUALTABLE) */
  890. #ifdef _WIN32
  891. __declspec(dllexport)
  892. #endif
  893. /*
  894. ** This routine is called when the extension is loaded. The new
  895. ** CSV virtual table module is registered with the calling database
  896. ** connection.
  897. */
  898. int sqlite3_csv_init(
  899. sqlite3 *db,
  900. char **pzErrMsg,
  901. const sqlite3_api_routines *pApi
  902. ){
  903. #ifndef SQLITE_OMIT_VIRTUALTABLE
  904. int rc;
  905. SQLITE_EXTENSION_INIT2(pApi);
  906. rc = sqlite3_create_module(db, "csv", &CsvModule, 0);
  907. #ifdef SQLITE_TEST
  908. if( rc==SQLITE_OK ){
  909. rc = sqlite3_create_module(db, "csv_wr", &CsvModuleFauxWrite, 0);
  910. }
  911. #endif
  912. return rc;
  913. #else
  914. return SQLITE_OK;
  915. #endif
  916. }