fts5_hash.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591
  1. /*
  2. ** 2014 August 11
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. **
  13. */
  14. #include "fts5Int.h"
  15. typedef struct Fts5HashEntry Fts5HashEntry;
  16. /*
  17. ** This file contains the implementation of an in-memory hash table used
  18. ** to accumuluate "term -> doclist" content before it is flused to a level-0
  19. ** segment.
  20. */
  21. struct Fts5Hash {
  22. int eDetail; /* Copy of Fts5Config.eDetail */
  23. int *pnByte; /* Pointer to bytes counter */
  24. int nEntry; /* Number of entries currently in hash */
  25. int nSlot; /* Size of aSlot[] array */
  26. Fts5HashEntry *pScan; /* Current ordered scan item */
  27. Fts5HashEntry **aSlot; /* Array of hash slots */
  28. };
  29. /*
  30. ** Each entry in the hash table is represented by an object of the
  31. ** following type. Each object, its key, and its current data are stored
  32. ** in a single memory allocation. The key immediately follows the object
  33. ** in memory. The position list data immediately follows the key data
  34. ** in memory.
  35. **
  36. ** The key is Fts5HashEntry.nKey bytes in size. It consists of a single
  37. ** byte identifying the index (either the main term index or a prefix-index),
  38. ** followed by the term data. For example: "0token". There is no
  39. ** nul-terminator - in this case nKey=6.
  40. **
  41. ** The data that follows the key is in a similar, but not identical format
  42. ** to the doclist data stored in the database. It is:
  43. **
  44. ** * Rowid, as a varint
  45. ** * Position list, without 0x00 terminator.
  46. ** * Size of previous position list and rowid, as a 4 byte
  47. ** big-endian integer.
  48. **
  49. ** iRowidOff:
  50. ** Offset of last rowid written to data area. Relative to first byte of
  51. ** structure.
  52. **
  53. ** nData:
  54. ** Bytes of data written since iRowidOff.
  55. */
  56. struct Fts5HashEntry {
  57. Fts5HashEntry *pHashNext; /* Next hash entry with same hash-key */
  58. Fts5HashEntry *pScanNext; /* Next entry in sorted order */
  59. int nAlloc; /* Total size of allocation */
  60. int iSzPoslist; /* Offset of space for 4-byte poslist size */
  61. int nData; /* Total bytes of data (incl. structure) */
  62. int nKey; /* Length of key in bytes */
  63. u8 bDel; /* Set delete-flag @ iSzPoslist */
  64. u8 bContent; /* Set content-flag (detail=none mode) */
  65. i16 iCol; /* Column of last value written */
  66. int iPos; /* Position of last value written */
  67. i64 iRowid; /* Rowid of last value written */
  68. };
  69. /*
  70. ** Eqivalent to:
  71. **
  72. ** char *fts5EntryKey(Fts5HashEntry *pEntry){ return zKey; }
  73. */
  74. #define fts5EntryKey(p) ( ((char *)(&(p)[1])) )
  75. /*
  76. ** Allocate a new hash table.
  77. */
  78. int sqlite3Fts5HashNew(Fts5Config *pConfig, Fts5Hash **ppNew, int *pnByte){
  79. int rc = SQLITE_OK;
  80. Fts5Hash *pNew;
  81. *ppNew = pNew = (Fts5Hash*)sqlite3_malloc(sizeof(Fts5Hash));
  82. if( pNew==0 ){
  83. rc = SQLITE_NOMEM;
  84. }else{
  85. sqlite3_int64 nByte;
  86. memset(pNew, 0, sizeof(Fts5Hash));
  87. pNew->pnByte = pnByte;
  88. pNew->eDetail = pConfig->eDetail;
  89. pNew->nSlot = 1024;
  90. nByte = sizeof(Fts5HashEntry*) * pNew->nSlot;
  91. pNew->aSlot = (Fts5HashEntry**)sqlite3_malloc64(nByte);
  92. if( pNew->aSlot==0 ){
  93. sqlite3_free(pNew);
  94. *ppNew = 0;
  95. rc = SQLITE_NOMEM;
  96. }else{
  97. memset(pNew->aSlot, 0, (size_t)nByte);
  98. }
  99. }
  100. return rc;
  101. }
  102. /*
  103. ** Free a hash table object.
  104. */
  105. void sqlite3Fts5HashFree(Fts5Hash *pHash){
  106. if( pHash ){
  107. sqlite3Fts5HashClear(pHash);
  108. sqlite3_free(pHash->aSlot);
  109. sqlite3_free(pHash);
  110. }
  111. }
  112. /*
  113. ** Empty (but do not delete) a hash table.
  114. */
  115. void sqlite3Fts5HashClear(Fts5Hash *pHash){
  116. int i;
  117. for(i=0; i<pHash->nSlot; i++){
  118. Fts5HashEntry *pNext;
  119. Fts5HashEntry *pSlot;
  120. for(pSlot=pHash->aSlot[i]; pSlot; pSlot=pNext){
  121. pNext = pSlot->pHashNext;
  122. sqlite3_free(pSlot);
  123. }
  124. }
  125. memset(pHash->aSlot, 0, pHash->nSlot * sizeof(Fts5HashEntry*));
  126. pHash->nEntry = 0;
  127. }
  128. static unsigned int fts5HashKey(int nSlot, const u8 *p, int n){
  129. int i;
  130. unsigned int h = 13;
  131. for(i=n-1; i>=0; i--){
  132. h = (h << 3) ^ h ^ p[i];
  133. }
  134. return (h % nSlot);
  135. }
  136. static unsigned int fts5HashKey2(int nSlot, u8 b, const u8 *p, int n){
  137. int i;
  138. unsigned int h = 13;
  139. for(i=n-1; i>=0; i--){
  140. h = (h << 3) ^ h ^ p[i];
  141. }
  142. h = (h << 3) ^ h ^ b;
  143. return (h % nSlot);
  144. }
  145. /*
  146. ** Resize the hash table by doubling the number of slots.
  147. */
  148. static int fts5HashResize(Fts5Hash *pHash){
  149. int nNew = pHash->nSlot*2;
  150. int i;
  151. Fts5HashEntry **apNew;
  152. Fts5HashEntry **apOld = pHash->aSlot;
  153. apNew = (Fts5HashEntry**)sqlite3_malloc64(nNew*sizeof(Fts5HashEntry*));
  154. if( !apNew ) return SQLITE_NOMEM;
  155. memset(apNew, 0, nNew*sizeof(Fts5HashEntry*));
  156. for(i=0; i<pHash->nSlot; i++){
  157. while( apOld[i] ){
  158. unsigned int iHash;
  159. Fts5HashEntry *p = apOld[i];
  160. apOld[i] = p->pHashNext;
  161. iHash = fts5HashKey(nNew, (u8*)fts5EntryKey(p), p->nKey);
  162. p->pHashNext = apNew[iHash];
  163. apNew[iHash] = p;
  164. }
  165. }
  166. sqlite3_free(apOld);
  167. pHash->nSlot = nNew;
  168. pHash->aSlot = apNew;
  169. return SQLITE_OK;
  170. }
  171. static int fts5HashAddPoslistSize(
  172. Fts5Hash *pHash,
  173. Fts5HashEntry *p,
  174. Fts5HashEntry *p2
  175. ){
  176. int nRet = 0;
  177. if( p->iSzPoslist ){
  178. u8 *pPtr = p2 ? (u8*)p2 : (u8*)p;
  179. int nData = p->nData;
  180. if( pHash->eDetail==FTS5_DETAIL_NONE ){
  181. assert( nData==p->iSzPoslist );
  182. if( p->bDel ){
  183. pPtr[nData++] = 0x00;
  184. if( p->bContent ){
  185. pPtr[nData++] = 0x00;
  186. }
  187. }
  188. }else{
  189. int nSz = (nData - p->iSzPoslist - 1); /* Size in bytes */
  190. int nPos = nSz*2 + p->bDel; /* Value of nPos field */
  191. assert( p->bDel==0 || p->bDel==1 );
  192. if( nPos<=127 ){
  193. pPtr[p->iSzPoslist] = (u8)nPos;
  194. }else{
  195. int nByte = sqlite3Fts5GetVarintLen((u32)nPos);
  196. memmove(&pPtr[p->iSzPoslist + nByte], &pPtr[p->iSzPoslist + 1], nSz);
  197. sqlite3Fts5PutVarint(&pPtr[p->iSzPoslist], nPos);
  198. nData += (nByte-1);
  199. }
  200. }
  201. nRet = nData - p->nData;
  202. if( p2==0 ){
  203. p->iSzPoslist = 0;
  204. p->bDel = 0;
  205. p->bContent = 0;
  206. p->nData = nData;
  207. }
  208. }
  209. return nRet;
  210. }
  211. /*
  212. ** Add an entry to the in-memory hash table. The key is the concatenation
  213. ** of bByte and (pToken/nToken). The value is (iRowid/iCol/iPos).
  214. **
  215. ** (bByte || pToken) -> (iRowid,iCol,iPos)
  216. **
  217. ** Or, if iCol is negative, then the value is a delete marker.
  218. */
  219. int sqlite3Fts5HashWrite(
  220. Fts5Hash *pHash,
  221. i64 iRowid, /* Rowid for this entry */
  222. int iCol, /* Column token appears in (-ve -> delete) */
  223. int iPos, /* Position of token within column */
  224. char bByte, /* First byte of token */
  225. const char *pToken, int nToken /* Token to add or remove to or from index */
  226. ){
  227. unsigned int iHash;
  228. Fts5HashEntry *p;
  229. u8 *pPtr;
  230. int nIncr = 0; /* Amount to increment (*pHash->pnByte) by */
  231. int bNew; /* If non-delete entry should be written */
  232. bNew = (pHash->eDetail==FTS5_DETAIL_FULL);
  233. /* Attempt to locate an existing hash entry */
  234. iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
  235. for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
  236. char *zKey = fts5EntryKey(p);
  237. if( zKey[0]==bByte
  238. && p->nKey==nToken+1
  239. && memcmp(&zKey[1], pToken, nToken)==0
  240. ){
  241. break;
  242. }
  243. }
  244. /* If an existing hash entry cannot be found, create a new one. */
  245. if( p==0 ){
  246. /* Figure out how much space to allocate */
  247. char *zKey;
  248. sqlite3_int64 nByte = sizeof(Fts5HashEntry) + (nToken+1) + 1 + 64;
  249. if( nByte<128 ) nByte = 128;
  250. /* Grow the Fts5Hash.aSlot[] array if necessary. */
  251. if( (pHash->nEntry*2)>=pHash->nSlot ){
  252. int rc = fts5HashResize(pHash);
  253. if( rc!=SQLITE_OK ) return rc;
  254. iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
  255. }
  256. /* Allocate new Fts5HashEntry and add it to the hash table. */
  257. p = (Fts5HashEntry*)sqlite3_malloc64(nByte);
  258. if( !p ) return SQLITE_NOMEM;
  259. memset(p, 0, sizeof(Fts5HashEntry));
  260. p->nAlloc = (int)nByte;
  261. zKey = fts5EntryKey(p);
  262. zKey[0] = bByte;
  263. memcpy(&zKey[1], pToken, nToken);
  264. assert( iHash==fts5HashKey(pHash->nSlot, (u8*)zKey, nToken+1) );
  265. p->nKey = nToken+1;
  266. zKey[nToken+1] = '\0';
  267. p->nData = nToken+1 + sizeof(Fts5HashEntry);
  268. p->pHashNext = pHash->aSlot[iHash];
  269. pHash->aSlot[iHash] = p;
  270. pHash->nEntry++;
  271. /* Add the first rowid field to the hash-entry */
  272. p->nData += sqlite3Fts5PutVarint(&((u8*)p)[p->nData], iRowid);
  273. p->iRowid = iRowid;
  274. p->iSzPoslist = p->nData;
  275. if( pHash->eDetail!=FTS5_DETAIL_NONE ){
  276. p->nData += 1;
  277. p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1);
  278. }
  279. }else{
  280. /* Appending to an existing hash-entry. Check that there is enough
  281. ** space to append the largest possible new entry. Worst case scenario
  282. ** is:
  283. **
  284. ** + 9 bytes for a new rowid,
  285. ** + 4 byte reserved for the "poslist size" varint.
  286. ** + 1 byte for a "new column" byte,
  287. ** + 3 bytes for a new column number (16-bit max) as a varint,
  288. ** + 5 bytes for the new position offset (32-bit max).
  289. */
  290. if( (p->nAlloc - p->nData) < (9 + 4 + 1 + 3 + 5) ){
  291. sqlite3_int64 nNew = p->nAlloc * 2;
  292. Fts5HashEntry *pNew;
  293. Fts5HashEntry **pp;
  294. pNew = (Fts5HashEntry*)sqlite3_realloc64(p, nNew);
  295. if( pNew==0 ) return SQLITE_NOMEM;
  296. pNew->nAlloc = (int)nNew;
  297. for(pp=&pHash->aSlot[iHash]; *pp!=p; pp=&(*pp)->pHashNext);
  298. *pp = pNew;
  299. p = pNew;
  300. }
  301. nIncr -= p->nData;
  302. }
  303. assert( (p->nAlloc - p->nData) >= (9 + 4 + 1 + 3 + 5) );
  304. pPtr = (u8*)p;
  305. /* If this is a new rowid, append the 4-byte size field for the previous
  306. ** entry, and the new rowid for this entry. */
  307. if( iRowid!=p->iRowid ){
  308. u64 iDiff = (u64)iRowid - (u64)p->iRowid;
  309. fts5HashAddPoslistSize(pHash, p, 0);
  310. p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iDiff);
  311. p->iRowid = iRowid;
  312. bNew = 1;
  313. p->iSzPoslist = p->nData;
  314. if( pHash->eDetail!=FTS5_DETAIL_NONE ){
  315. p->nData += 1;
  316. p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1);
  317. p->iPos = 0;
  318. }
  319. }
  320. if( iCol>=0 ){
  321. if( pHash->eDetail==FTS5_DETAIL_NONE ){
  322. p->bContent = 1;
  323. }else{
  324. /* Append a new column value, if necessary */
  325. assert_nc( iCol>=p->iCol );
  326. if( iCol!=p->iCol ){
  327. if( pHash->eDetail==FTS5_DETAIL_FULL ){
  328. pPtr[p->nData++] = 0x01;
  329. p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iCol);
  330. p->iCol = (i16)iCol;
  331. p->iPos = 0;
  332. }else{
  333. bNew = 1;
  334. p->iCol = (i16)(iPos = iCol);
  335. }
  336. }
  337. /* Append the new position offset, if necessary */
  338. if( bNew ){
  339. p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iPos - p->iPos + 2);
  340. p->iPos = iPos;
  341. }
  342. }
  343. }else{
  344. /* This is a delete. Set the delete flag. */
  345. p->bDel = 1;
  346. }
  347. nIncr += p->nData;
  348. *pHash->pnByte += nIncr;
  349. return SQLITE_OK;
  350. }
  351. /*
  352. ** Arguments pLeft and pRight point to linked-lists of hash-entry objects,
  353. ** each sorted in key order. This function merges the two lists into a
  354. ** single list and returns a pointer to its first element.
  355. */
  356. static Fts5HashEntry *fts5HashEntryMerge(
  357. Fts5HashEntry *pLeft,
  358. Fts5HashEntry *pRight
  359. ){
  360. Fts5HashEntry *p1 = pLeft;
  361. Fts5HashEntry *p2 = pRight;
  362. Fts5HashEntry *pRet = 0;
  363. Fts5HashEntry **ppOut = &pRet;
  364. while( p1 || p2 ){
  365. if( p1==0 ){
  366. *ppOut = p2;
  367. p2 = 0;
  368. }else if( p2==0 ){
  369. *ppOut = p1;
  370. p1 = 0;
  371. }else{
  372. char *zKey1 = fts5EntryKey(p1);
  373. char *zKey2 = fts5EntryKey(p2);
  374. int nMin = MIN(p1->nKey, p2->nKey);
  375. int cmp = memcmp(zKey1, zKey2, nMin);
  376. if( cmp==0 ){
  377. cmp = p1->nKey - p2->nKey;
  378. }
  379. assert( cmp!=0 );
  380. if( cmp>0 ){
  381. /* p2 is smaller */
  382. *ppOut = p2;
  383. ppOut = &p2->pScanNext;
  384. p2 = p2->pScanNext;
  385. }else{
  386. /* p1 is smaller */
  387. *ppOut = p1;
  388. ppOut = &p1->pScanNext;
  389. p1 = p1->pScanNext;
  390. }
  391. *ppOut = 0;
  392. }
  393. }
  394. return pRet;
  395. }
  396. /*
  397. ** Link all tokens from hash table iHash into a list in sorted order. The
  398. ** tokens are not removed from the hash table.
  399. */
  400. static int fts5HashEntrySort(
  401. Fts5Hash *pHash,
  402. const char *pTerm, int nTerm, /* Query prefix, if any */
  403. Fts5HashEntry **ppSorted
  404. ){
  405. const int nMergeSlot = 32;
  406. Fts5HashEntry **ap;
  407. Fts5HashEntry *pList;
  408. int iSlot;
  409. int i;
  410. *ppSorted = 0;
  411. ap = sqlite3_malloc64(sizeof(Fts5HashEntry*) * nMergeSlot);
  412. if( !ap ) return SQLITE_NOMEM;
  413. memset(ap, 0, sizeof(Fts5HashEntry*) * nMergeSlot);
  414. for(iSlot=0; iSlot<pHash->nSlot; iSlot++){
  415. Fts5HashEntry *pIter;
  416. for(pIter=pHash->aSlot[iSlot]; pIter; pIter=pIter->pHashNext){
  417. if( pTerm==0
  418. || (pIter->nKey>=nTerm && 0==memcmp(fts5EntryKey(pIter), pTerm, nTerm))
  419. ){
  420. Fts5HashEntry *pEntry = pIter;
  421. pEntry->pScanNext = 0;
  422. for(i=0; ap[i]; i++){
  423. pEntry = fts5HashEntryMerge(pEntry, ap[i]);
  424. ap[i] = 0;
  425. }
  426. ap[i] = pEntry;
  427. }
  428. }
  429. }
  430. pList = 0;
  431. for(i=0; i<nMergeSlot; i++){
  432. pList = fts5HashEntryMerge(pList, ap[i]);
  433. }
  434. sqlite3_free(ap);
  435. *ppSorted = pList;
  436. return SQLITE_OK;
  437. }
  438. /*
  439. ** Query the hash table for a doclist associated with term pTerm/nTerm.
  440. */
  441. int sqlite3Fts5HashQuery(
  442. Fts5Hash *pHash, /* Hash table to query */
  443. int nPre,
  444. const char *pTerm, int nTerm, /* Query term */
  445. void **ppOut, /* OUT: Pointer to new object */
  446. int *pnDoclist /* OUT: Size of doclist in bytes */
  447. ){
  448. unsigned int iHash = fts5HashKey(pHash->nSlot, (const u8*)pTerm, nTerm);
  449. char *zKey = 0;
  450. Fts5HashEntry *p;
  451. for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
  452. zKey = fts5EntryKey(p);
  453. if( nTerm==p->nKey && memcmp(zKey, pTerm, nTerm)==0 ) break;
  454. }
  455. if( p ){
  456. int nHashPre = sizeof(Fts5HashEntry) + nTerm;
  457. int nList = p->nData - nHashPre;
  458. u8 *pRet = (u8*)(*ppOut = sqlite3_malloc64(nPre + nList + 10));
  459. if( pRet ){
  460. Fts5HashEntry *pFaux = (Fts5HashEntry*)&pRet[nPre-nHashPre];
  461. memcpy(&pRet[nPre], &((u8*)p)[nHashPre], nList);
  462. nList += fts5HashAddPoslistSize(pHash, p, pFaux);
  463. *pnDoclist = nList;
  464. }else{
  465. *pnDoclist = 0;
  466. return SQLITE_NOMEM;
  467. }
  468. }else{
  469. *ppOut = 0;
  470. *pnDoclist = 0;
  471. }
  472. return SQLITE_OK;
  473. }
  474. int sqlite3Fts5HashScanInit(
  475. Fts5Hash *p, /* Hash table to query */
  476. const char *pTerm, int nTerm /* Query prefix */
  477. ){
  478. return fts5HashEntrySort(p, pTerm, nTerm, &p->pScan);
  479. }
  480. #ifdef SQLITE_DEBUG
  481. static int fts5HashCount(Fts5Hash *pHash){
  482. int nEntry = 0;
  483. int ii;
  484. for(ii=0; ii<pHash->nSlot; ii++){
  485. Fts5HashEntry *p = 0;
  486. for(p=pHash->aSlot[ii]; p; p=p->pHashNext){
  487. nEntry++;
  488. }
  489. }
  490. return nEntry;
  491. }
  492. #endif
  493. /*
  494. ** Return true if the hash table is empty, false otherwise.
  495. */
  496. int sqlite3Fts5HashIsEmpty(Fts5Hash *pHash){
  497. assert( pHash->nEntry==fts5HashCount(pHash) );
  498. return pHash->nEntry==0;
  499. }
  500. void sqlite3Fts5HashScanNext(Fts5Hash *p){
  501. assert( !sqlite3Fts5HashScanEof(p) );
  502. p->pScan = p->pScan->pScanNext;
  503. }
  504. int sqlite3Fts5HashScanEof(Fts5Hash *p){
  505. return (p->pScan==0);
  506. }
  507. void sqlite3Fts5HashScanEntry(
  508. Fts5Hash *pHash,
  509. const char **pzTerm, /* OUT: term (nul-terminated) */
  510. int *pnTerm, /* OUT: Size of term in bytes */
  511. const u8 **ppDoclist, /* OUT: pointer to doclist */
  512. int *pnDoclist /* OUT: size of doclist in bytes */
  513. ){
  514. Fts5HashEntry *p;
  515. if( (p = pHash->pScan) ){
  516. char *zKey = fts5EntryKey(p);
  517. int nTerm = p->nKey;
  518. fts5HashAddPoslistSize(pHash, p, 0);
  519. *pzTerm = zKey;
  520. *pnTerm = nTerm;
  521. *ppDoclist = (const u8*)&zKey[nTerm];
  522. *pnDoclist = p->nData - (sizeof(Fts5HashEntry) + nTerm);
  523. }else{
  524. *pzTerm = 0;
  525. *pnTerm = 0;
  526. *ppDoclist = 0;
  527. *pnDoclist = 0;
  528. }
  529. }