lsm_ckpt.c 38 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240
  1. /*
  2. ** 2011-09-11
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. *************************************************************************
  12. **
  13. ** This file contains code to read and write checkpoints.
  14. **
  15. ** A checkpoint represents the database layout at a single point in time.
  16. ** It includes a log offset. When an existing database is opened, the
  17. ** current state is determined by reading the newest checkpoint and updating
  18. ** it with all committed transactions from the log that follow the specified
  19. ** offset.
  20. */
  21. #include "lsmInt.h"
  22. /*
  23. ** CHECKPOINT BLOB FORMAT:
  24. **
  25. ** A checkpoint blob is a series of unsigned 32-bit integers stored in
  26. ** big-endian byte order. As follows:
  27. **
  28. ** Checkpoint header (see the CKPT_HDR_XXX #defines):
  29. **
  30. ** 1. The checkpoint id MSW.
  31. ** 2. The checkpoint id LSW.
  32. ** 3. The number of integer values in the entire checkpoint, including
  33. ** the two checksum values.
  34. ** 4. The compression scheme id.
  35. ** 5. The total number of blocks in the database.
  36. ** 6. The block size.
  37. ** 7. The number of levels.
  38. ** 8. The nominal database page size.
  39. ** 9. The number of pages (in total) written to the database file.
  40. **
  41. ** Log pointer:
  42. **
  43. ** 1. The log offset MSW.
  44. ** 2. The log offset LSW.
  45. ** 3. Log checksum 0.
  46. ** 4. Log checksum 1.
  47. **
  48. ** Note that the "log offset" is not the literal byte offset. Instead,
  49. ** it is the byte offset multiplied by 2, with least significant bit
  50. ** toggled each time the log pointer value is changed. This is to make
  51. ** sure that this field changes each time the log pointer is updated,
  52. ** even if the log file itself is disabled. See lsmTreeMakeOld().
  53. **
  54. ** See ckptExportLog() and ckptImportLog().
  55. **
  56. ** Append points:
  57. **
  58. ** 8 integers (4 * 64-bit page numbers). See ckptExportAppendlist().
  59. **
  60. ** For each level in the database, a level record. Formatted as follows:
  61. **
  62. ** 0. Age of the level (least significant 16-bits). And flags mask (most
  63. ** significant 16-bits).
  64. ** 1. The number of right-hand segments (nRight, possibly 0),
  65. ** 2. Segment record for left-hand segment (8 integers defined below),
  66. ** 3. Segment record for each right-hand segment (8 integers defined below),
  67. ** 4. If nRight>0, The number of segments involved in the merge
  68. ** 5. if nRight>0, Current nSkip value (see Merge structure defn.),
  69. ** 6. For each segment in the merge:
  70. ** 5a. Page number of next cell to read during merge (this field
  71. ** is 64-bits - 2 integers)
  72. ** 5b. Cell number of next cell to read during merge
  73. ** 7. Page containing current split-key (64-bits - 2 integers).
  74. ** 8. Cell within page containing current split-key.
  75. ** 9. Current pointer value (64-bits - 2 integers).
  76. **
  77. ** The block redirect array:
  78. **
  79. ** 1. Number of redirections (maximum LSM_MAX_BLOCK_REDIRECTS).
  80. ** 2. For each redirection:
  81. ** a. "from" block number
  82. ** b. "to" block number
  83. **
  84. ** The in-memory freelist entries. Each entry is either an insert or a
  85. ** delete. The in-memory freelist is to the free-block-list as the
  86. ** in-memory tree is to the users database content.
  87. **
  88. ** 1. Number of free-list entries stored in checkpoint header.
  89. ** 2. Number of free blocks (in total).
  90. ** 3. Total number of blocks freed during database lifetime.
  91. ** 4. For each entry:
  92. ** 2a. Block number of free block.
  93. ** 2b. A 64-bit integer (MSW followed by LSW). -1 for a delete entry,
  94. ** or the associated checkpoint id for an insert.
  95. **
  96. ** The checksum:
  97. **
  98. ** 1. Checksum value 1.
  99. ** 2. Checksum value 2.
  100. **
  101. ** In the above, a segment record consists of the following four 64-bit
  102. ** fields (converted to 2 * u32 by storing the MSW followed by LSW):
  103. **
  104. ** 1. First page of array,
  105. ** 2. Last page of array,
  106. ** 3. Root page of array (or 0),
  107. ** 4. Size of array in pages.
  108. */
  109. /*
  110. ** LARGE NUMBERS OF LEVEL RECORDS:
  111. **
  112. ** A limit on the number of rhs segments that may be present in the database
  113. ** file. Defining this limit ensures that all level records fit within
  114. ** the 4096 byte limit for checkpoint blobs.
  115. **
  116. ** The number of right-hand-side segments in a database is counted as
  117. ** follows:
  118. **
  119. ** * For each level in the database not undergoing a merge, add 1.
  120. **
  121. ** * For each level in the database that is undergoing a merge, add
  122. ** the number of segments on the rhs of the level.
  123. **
  124. ** A level record not undergoing a merge is 10 integers. A level record
  125. ** with nRhs rhs segments and (nRhs+1) input segments (i.e. including the
  126. ** separators from the next level) is (11*nRhs+20) integers. The maximum
  127. ** per right-hand-side level is therefore 21 integers. So the maximum
  128. ** size of all level records in a checkpoint is 21*40=820 integers.
  129. **
  130. ** TODO: Before pointer values were changed from 32 to 64 bits, the above
  131. ** used to come to 420 bytes - leaving significant space for a free-list
  132. ** prefix. No more. To fix this, reduce the size of the level records in
  133. ** a db snapshot, and improve management of the free-list tail in
  134. ** lsm_sorted.c.
  135. */
  136. #define LSM_MAX_RHS_SEGMENTS 40
  137. /*
  138. ** LARGE NUMBERS OF FREELIST ENTRIES:
  139. **
  140. ** There is also a limit (LSM_MAX_FREELIST_ENTRIES - defined in lsmInt.h)
  141. ** on the number of free-list entries stored in a checkpoint. Since each
  142. ** free-list entry consists of 3 integers, the maximum free-list size is
  143. ** 3*100=300 integers. Combined with the limit on rhs segments defined
  144. ** above, this ensures that a checkpoint always fits within a 4096 byte
  145. ** meta page.
  146. **
  147. ** If the database contains more than 100 free blocks, the "overflow" flag
  148. ** in the checkpoint header is set and the remainder are stored in the
  149. ** system FREELIST entry in the LSM (along with user data). The value
  150. ** accompanying the FREELIST key in the LSM is, like a checkpoint, an array
  151. ** of 32-bit big-endian integers. As follows:
  152. **
  153. ** For each entry:
  154. ** a. Block number of free block.
  155. ** b. MSW of associated checkpoint id.
  156. ** c. LSW of associated checkpoint id.
  157. **
  158. ** The number of entries is not required - it is implied by the size of the
  159. ** value blob containing the integer array.
  160. **
  161. ** Note that the limit defined by LSM_MAX_FREELIST_ENTRIES is a hard limit.
  162. ** The actual value used may be configured using LSM_CONFIG_MAX_FREELIST.
  163. */
  164. /*
  165. ** The argument to this macro must be of type u32. On a little-endian
  166. ** architecture, it returns the u32 value that results from interpreting
  167. ** the 4 bytes as a big-endian value. On a big-endian architecture, it
  168. ** returns the value that would be produced by interpreting the 4 bytes
  169. ** of the input value as a little-endian integer.
  170. */
  171. #define BYTESWAP32(x) ( \
  172. (((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8) \
  173. + (((x)&0x00FF0000)>>8) + (((x)&0xFF000000)>>24) \
  174. )
  175. static const int one = 1;
  176. #define LSM_LITTLE_ENDIAN (*(u8 *)(&one))
  177. /* Sizes, in integers, of various parts of the checkpoint. */
  178. #define CKPT_HDR_SIZE 9
  179. #define CKPT_LOGPTR_SIZE 4
  180. #define CKPT_APPENDLIST_SIZE (LSM_APPLIST_SZ * 2)
  181. /* A #define to describe each integer in the checkpoint header. */
  182. #define CKPT_HDR_ID_MSW 0
  183. #define CKPT_HDR_ID_LSW 1
  184. #define CKPT_HDR_NCKPT 2
  185. #define CKPT_HDR_CMPID 3
  186. #define CKPT_HDR_NBLOCK 4
  187. #define CKPT_HDR_BLKSZ 5
  188. #define CKPT_HDR_NLEVEL 6
  189. #define CKPT_HDR_PGSZ 7
  190. #define CKPT_HDR_NWRITE 8
  191. #define CKPT_HDR_LO_MSW 9
  192. #define CKPT_HDR_LO_LSW 10
  193. #define CKPT_HDR_LO_CKSUM1 11
  194. #define CKPT_HDR_LO_CKSUM2 12
  195. typedef struct CkptBuffer CkptBuffer;
  196. /*
  197. ** Dynamic buffer used to accumulate data for a checkpoint.
  198. */
  199. struct CkptBuffer {
  200. lsm_env *pEnv;
  201. int nAlloc;
  202. u32 *aCkpt;
  203. };
  204. /*
  205. ** Calculate the checksum of the checkpoint specified by arguments aCkpt and
  206. ** nCkpt. Store the checksum in *piCksum1 and *piCksum2 before returning.
  207. **
  208. ** The value of the nCkpt parameter includes the two checksum values at
  209. ** the end of the checkpoint. They are not used as inputs to the checksum
  210. ** calculation. The checksum is based on the array of (nCkpt-2) integers
  211. ** at aCkpt[].
  212. */
  213. static void ckptChecksum(u32 *aCkpt, u32 nCkpt, u32 *piCksum1, u32 *piCksum2){
  214. u32 i;
  215. u32 cksum1 = 1;
  216. u32 cksum2 = 2;
  217. if( nCkpt % 2 ){
  218. cksum1 += aCkpt[nCkpt-3] & 0x0000FFFF;
  219. cksum2 += aCkpt[nCkpt-3] & 0xFFFF0000;
  220. }
  221. for(i=0; (i+3)<nCkpt; i+=2){
  222. cksum1 += cksum2 + aCkpt[i];
  223. cksum2 += cksum1 + aCkpt[i+1];
  224. }
  225. *piCksum1 = cksum1;
  226. *piCksum2 = cksum2;
  227. }
  228. /*
  229. ** Set integer iIdx of the checkpoint accumulating in buffer *p to iVal.
  230. */
  231. static void ckptSetValue(CkptBuffer *p, int iIdx, u32 iVal, int *pRc){
  232. if( *pRc ) return;
  233. if( iIdx>=p->nAlloc ){
  234. int nNew = LSM_MAX(8, iIdx*2);
  235. p->aCkpt = (u32 *)lsmReallocOrFree(p->pEnv, p->aCkpt, nNew*sizeof(u32));
  236. if( !p->aCkpt ){
  237. *pRc = LSM_NOMEM_BKPT;
  238. return;
  239. }
  240. p->nAlloc = nNew;
  241. }
  242. p->aCkpt[iIdx] = iVal;
  243. }
  244. /*
  245. ** Argument aInt points to an array nInt elements in size. Switch the
  246. ** endian-ness of each element of the array.
  247. */
  248. static void ckptChangeEndianness(u32 *aInt, int nInt){
  249. if( LSM_LITTLE_ENDIAN ){
  250. int i;
  251. for(i=0; i<nInt; i++) aInt[i] = BYTESWAP32(aInt[i]);
  252. }
  253. }
  254. /*
  255. ** Object *p contains a checkpoint in native byte-order. The checkpoint is
  256. ** nCkpt integers in size, not including any checksum. This function sets
  257. ** the two checksum elements of the checkpoint accordingly.
  258. */
  259. static void ckptAddChecksum(CkptBuffer *p, int nCkpt, int *pRc){
  260. if( *pRc==LSM_OK ){
  261. u32 aCksum[2] = {0, 0};
  262. ckptChecksum(p->aCkpt, nCkpt+2, &aCksum[0], &aCksum[1]);
  263. ckptSetValue(p, nCkpt, aCksum[0], pRc);
  264. ckptSetValue(p, nCkpt+1, aCksum[1], pRc);
  265. }
  266. }
  267. static void ckptAppend64(CkptBuffer *p, int *piOut, i64 iVal, int *pRc){
  268. int iOut = *piOut;
  269. ckptSetValue(p, iOut++, (iVal >> 32) & 0xFFFFFFFF, pRc);
  270. ckptSetValue(p, iOut++, (iVal & 0xFFFFFFFF), pRc);
  271. *piOut = iOut;
  272. }
  273. static i64 ckptRead64(u32 *a){
  274. return (((i64)a[0]) << 32) + (i64)a[1];
  275. }
  276. static i64 ckptGobble64(u32 *a, int *piIn){
  277. int iIn = *piIn;
  278. *piIn += 2;
  279. return ckptRead64(&a[iIn]);
  280. }
  281. /*
  282. ** Append a 6-value segment record corresponding to pSeg to the checkpoint
  283. ** buffer passed as the third argument.
  284. */
  285. static void ckptExportSegment(
  286. Segment *pSeg,
  287. CkptBuffer *p,
  288. int *piOut,
  289. int *pRc
  290. ){
  291. ckptAppend64(p, piOut, pSeg->iFirst, pRc);
  292. ckptAppend64(p, piOut, pSeg->iLastPg, pRc);
  293. ckptAppend64(p, piOut, pSeg->iRoot, pRc);
  294. ckptAppend64(p, piOut, pSeg->nSize, pRc);
  295. }
  296. static void ckptExportLevel(
  297. Level *pLevel, /* Level object to serialize */
  298. CkptBuffer *p, /* Append new level record to this ckpt */
  299. int *piOut, /* IN/OUT: Size of checkpoint so far */
  300. int *pRc /* IN/OUT: Error code */
  301. ){
  302. int iOut = *piOut;
  303. Merge *pMerge;
  304. pMerge = pLevel->pMerge;
  305. ckptSetValue(p, iOut++, (u32)pLevel->iAge + (u32)(pLevel->flags<<16), pRc);
  306. ckptSetValue(p, iOut++, pLevel->nRight, pRc);
  307. ckptExportSegment(&pLevel->lhs, p, &iOut, pRc);
  308. assert( (pLevel->nRight>0)==(pMerge!=0) );
  309. if( pMerge ){
  310. int i;
  311. for(i=0; i<pLevel->nRight; i++){
  312. ckptExportSegment(&pLevel->aRhs[i], p, &iOut, pRc);
  313. }
  314. assert( pMerge->nInput==pLevel->nRight
  315. || pMerge->nInput==pLevel->nRight+1
  316. );
  317. ckptSetValue(p, iOut++, pMerge->nInput, pRc);
  318. ckptSetValue(p, iOut++, pMerge->nSkip, pRc);
  319. for(i=0; i<pMerge->nInput; i++){
  320. ckptAppend64(p, &iOut, pMerge->aInput[i].iPg, pRc);
  321. ckptSetValue(p, iOut++, pMerge->aInput[i].iCell, pRc);
  322. }
  323. ckptAppend64(p, &iOut, pMerge->splitkey.iPg, pRc);
  324. ckptSetValue(p, iOut++, pMerge->splitkey.iCell, pRc);
  325. ckptAppend64(p, &iOut, pMerge->iCurrentPtr, pRc);
  326. }
  327. *piOut = iOut;
  328. }
  329. /*
  330. ** Populate the log offset fields of the checkpoint buffer. 4 values.
  331. */
  332. static void ckptExportLog(
  333. lsm_db *pDb,
  334. int bFlush,
  335. CkptBuffer *p,
  336. int *piOut,
  337. int *pRc
  338. ){
  339. int iOut = *piOut;
  340. assert( iOut==CKPT_HDR_LO_MSW );
  341. if( bFlush ){
  342. i64 iOff = pDb->treehdr.iOldLog;
  343. ckptAppend64(p, &iOut, iOff, pRc);
  344. ckptSetValue(p, iOut++, pDb->treehdr.oldcksum0, pRc);
  345. ckptSetValue(p, iOut++, pDb->treehdr.oldcksum1, pRc);
  346. }else{
  347. for(; iOut<=CKPT_HDR_LO_CKSUM2; iOut++){
  348. ckptSetValue(p, iOut, pDb->pShmhdr->aSnap2[iOut], pRc);
  349. }
  350. }
  351. assert( *pRc || iOut==CKPT_HDR_LO_CKSUM2+1 );
  352. *piOut = iOut;
  353. }
  354. static void ckptExportAppendlist(
  355. lsm_db *db, /* Database connection */
  356. CkptBuffer *p, /* Checkpoint buffer to write to */
  357. int *piOut, /* IN/OUT: Offset within checkpoint buffer */
  358. int *pRc /* IN/OUT: Error code */
  359. ){
  360. int i;
  361. LsmPgno *aiAppend = db->pWorker->aiAppend;
  362. for(i=0; i<LSM_APPLIST_SZ; i++){
  363. ckptAppend64(p, piOut, aiAppend[i], pRc);
  364. }
  365. };
  366. static int ckptExportSnapshot(
  367. lsm_db *pDb, /* Connection handle */
  368. int bLog, /* True to update log-offset fields */
  369. i64 iId, /* Checkpoint id */
  370. int bCksum, /* If true, include checksums */
  371. void **ppCkpt, /* OUT: Buffer containing checkpoint */
  372. int *pnCkpt /* OUT: Size of checkpoint in bytes */
  373. ){
  374. int rc = LSM_OK; /* Return Code */
  375. FileSystem *pFS = pDb->pFS; /* File system object */
  376. Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */
  377. int nLevel = 0; /* Number of levels in checkpoint */
  378. int iLevel; /* Used to count out nLevel levels */
  379. int iOut = 0; /* Current offset in aCkpt[] */
  380. Level *pLevel; /* Level iterator */
  381. int i; /* Iterator used while serializing freelist */
  382. CkptBuffer ckpt;
  383. /* Initialize the output buffer */
  384. memset(&ckpt, 0, sizeof(CkptBuffer));
  385. ckpt.pEnv = pDb->pEnv;
  386. iOut = CKPT_HDR_SIZE;
  387. /* Write the log offset into the checkpoint. */
  388. ckptExportLog(pDb, bLog, &ckpt, &iOut, &rc);
  389. /* Write the append-point list */
  390. ckptExportAppendlist(pDb, &ckpt, &iOut, &rc);
  391. /* Figure out how many levels will be written to the checkpoint. */
  392. for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nLevel++;
  393. /* Serialize nLevel levels. */
  394. iLevel = 0;
  395. for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nLevel; pLevel=pLevel->pNext){
  396. ckptExportLevel(pLevel, &ckpt, &iOut, &rc);
  397. iLevel++;
  398. }
  399. /* Write the block-redirect list */
  400. ckptSetValue(&ckpt, iOut++, pSnap->redirect.n, &rc);
  401. for(i=0; i<pSnap->redirect.n; i++){
  402. ckptSetValue(&ckpt, iOut++, pSnap->redirect.a[i].iFrom, &rc);
  403. ckptSetValue(&ckpt, iOut++, pSnap->redirect.a[i].iTo, &rc);
  404. }
  405. /* Write the freelist */
  406. assert( pSnap->freelist.nEntry<=pDb->nMaxFreelist );
  407. if( rc==LSM_OK ){
  408. int nFree = pSnap->freelist.nEntry;
  409. ckptSetValue(&ckpt, iOut++, nFree, &rc);
  410. for(i=0; i<nFree; i++){
  411. FreelistEntry *p = &pSnap->freelist.aEntry[i];
  412. ckptSetValue(&ckpt, iOut++, p->iBlk, &rc);
  413. ckptSetValue(&ckpt, iOut++, (p->iId >> 32) & 0xFFFFFFFF, &rc);
  414. ckptSetValue(&ckpt, iOut++, p->iId & 0xFFFFFFFF, &rc);
  415. }
  416. }
  417. /* Write the checkpoint header */
  418. assert( iId>=0 );
  419. assert( pSnap->iCmpId==pDb->compress.iId
  420. || pSnap->iCmpId==LSM_COMPRESSION_EMPTY
  421. );
  422. ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc);
  423. ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc);
  424. ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc);
  425. ckptSetValue(&ckpt, CKPT_HDR_CMPID, pDb->compress.iId, &rc);
  426. ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, pSnap->nBlock, &rc);
  427. ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc);
  428. ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nLevel, &rc);
  429. ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc);
  430. ckptSetValue(&ckpt, CKPT_HDR_NWRITE, pSnap->nWrite, &rc);
  431. if( bCksum ){
  432. ckptAddChecksum(&ckpt, iOut, &rc);
  433. }else{
  434. ckptSetValue(&ckpt, iOut, 0, &rc);
  435. ckptSetValue(&ckpt, iOut+1, 0, &rc);
  436. }
  437. iOut += 2;
  438. assert( iOut<=1024 );
  439. #ifdef LSM_LOG_FREELIST
  440. lsmLogMessage(pDb, rc,
  441. "ckptExportSnapshot(): id=%lld freelist: %d", iId, pSnap->freelist.nEntry
  442. );
  443. for(i=0; i<pSnap->freelist.nEntry; i++){
  444. lsmLogMessage(pDb, rc,
  445. "ckptExportSnapshot(): iBlk=%d id=%lld",
  446. pSnap->freelist.aEntry[i].iBlk,
  447. pSnap->freelist.aEntry[i].iId
  448. );
  449. }
  450. #endif
  451. *ppCkpt = (void *)ckpt.aCkpt;
  452. if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut;
  453. return rc;
  454. }
  455. /*
  456. ** Helper function for ckptImport().
  457. */
  458. static void ckptNewSegment(
  459. u32 *aIn,
  460. int *piIn,
  461. Segment *pSegment /* Populate this structure */
  462. ){
  463. assert( pSegment->iFirst==0 && pSegment->iLastPg==0 );
  464. assert( pSegment->nSize==0 && pSegment->iRoot==0 );
  465. pSegment->iFirst = ckptGobble64(aIn, piIn);
  466. pSegment->iLastPg = ckptGobble64(aIn, piIn);
  467. pSegment->iRoot = ckptGobble64(aIn, piIn);
  468. pSegment->nSize = ckptGobble64(aIn, piIn);
  469. assert( pSegment->iFirst );
  470. }
  471. static int ckptSetupMerge(lsm_db *pDb, u32 *aInt, int *piIn, Level *pLevel){
  472. Merge *pMerge; /* Allocated Merge object */
  473. int nInput; /* Number of input segments in merge */
  474. int iIn = *piIn; /* Next value to read from aInt[] */
  475. int i; /* Iterator variable */
  476. int nByte; /* Number of bytes to allocate */
  477. /* Allocate the Merge object. If malloc() fails, return LSM_NOMEM. */
  478. nInput = (int)aInt[iIn++];
  479. nByte = sizeof(Merge) + sizeof(MergeInput) * nInput;
  480. pMerge = (Merge *)lsmMallocZero(pDb->pEnv, nByte);
  481. if( !pMerge ) return LSM_NOMEM_BKPT;
  482. pLevel->pMerge = pMerge;
  483. /* Populate the Merge object. */
  484. pMerge->aInput = (MergeInput *)&pMerge[1];
  485. pMerge->nInput = nInput;
  486. pMerge->iOutputOff = -1;
  487. pMerge->nSkip = (int)aInt[iIn++];
  488. for(i=0; i<nInput; i++){
  489. pMerge->aInput[i].iPg = ckptGobble64(aInt, &iIn);
  490. pMerge->aInput[i].iCell = (int)aInt[iIn++];
  491. }
  492. pMerge->splitkey.iPg = ckptGobble64(aInt, &iIn);
  493. pMerge->splitkey.iCell = (int)aInt[iIn++];
  494. pMerge->iCurrentPtr = ckptGobble64(aInt, &iIn);
  495. /* Set *piIn and return LSM_OK. */
  496. *piIn = iIn;
  497. return LSM_OK;
  498. }
  499. static int ckptLoadLevels(
  500. lsm_db *pDb,
  501. u32 *aIn,
  502. int *piIn,
  503. int nLevel,
  504. Level **ppLevel
  505. ){
  506. int i;
  507. int rc = LSM_OK;
  508. Level *pRet = 0;
  509. Level **ppNext;
  510. int iIn = *piIn;
  511. ppNext = &pRet;
  512. for(i=0; rc==LSM_OK && i<nLevel; i++){
  513. int iRight;
  514. Level *pLevel;
  515. /* Allocate space for the Level structure and Level.apRight[] array */
  516. pLevel = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
  517. if( rc==LSM_OK ){
  518. pLevel->iAge = (u16)(aIn[iIn] & 0x0000FFFF);
  519. pLevel->flags = (u16)((aIn[iIn]>>16) & 0x0000FFFF);
  520. iIn++;
  521. pLevel->nRight = aIn[iIn++];
  522. if( pLevel->nRight ){
  523. int nByte = sizeof(Segment) * pLevel->nRight;
  524. pLevel->aRhs = (Segment *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
  525. }
  526. if( rc==LSM_OK ){
  527. *ppNext = pLevel;
  528. ppNext = &pLevel->pNext;
  529. /* Allocate the main segment */
  530. ckptNewSegment(aIn, &iIn, &pLevel->lhs);
  531. /* Allocate each of the right-hand segments, if any */
  532. for(iRight=0; iRight<pLevel->nRight; iRight++){
  533. ckptNewSegment(aIn, &iIn, &pLevel->aRhs[iRight]);
  534. }
  535. /* Set up the Merge object, if required */
  536. if( pLevel->nRight>0 ){
  537. rc = ckptSetupMerge(pDb, aIn, &iIn, pLevel);
  538. }
  539. }
  540. }
  541. }
  542. if( rc!=LSM_OK ){
  543. /* An OOM must have occurred. Free any level structures allocated and
  544. ** return the error to the caller. */
  545. lsmSortedFreeLevel(pDb->pEnv, pRet);
  546. pRet = 0;
  547. }
  548. *ppLevel = pRet;
  549. *piIn = iIn;
  550. return rc;
  551. }
  552. int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal){
  553. int rc = LSM_OK;
  554. if( nVal>0 ){
  555. u32 *aIn;
  556. aIn = lsmMallocRc(pDb->pEnv, nVal, &rc);
  557. if( aIn ){
  558. Level *pLevel = 0;
  559. Level *pParent;
  560. int nIn;
  561. int nLevel;
  562. int iIn = 1;
  563. memcpy(aIn, pVal, nVal);
  564. nIn = nVal / sizeof(u32);
  565. ckptChangeEndianness(aIn, nIn);
  566. nLevel = aIn[0];
  567. rc = ckptLoadLevels(pDb, aIn, &iIn, nLevel, &pLevel);
  568. lsmFree(pDb->pEnv, aIn);
  569. assert( rc==LSM_OK || pLevel==0 );
  570. if( rc==LSM_OK ){
  571. pParent = lsmDbSnapshotLevel(pDb->pWorker);
  572. assert( pParent );
  573. while( pParent->pNext ) pParent = pParent->pNext;
  574. pParent->pNext = pLevel;
  575. }
  576. }
  577. }
  578. return rc;
  579. }
  580. /*
  581. ** Return the data for the LEVELS record.
  582. **
  583. ** The size of the checkpoint that can be stored in the database header
  584. ** must not exceed 1024 32-bit integers. Normally, it does not. However,
  585. ** if it does, part of the checkpoint must be stored in the LSM. This
  586. ** routine returns that part.
  587. */
  588. int lsmCheckpointLevels(
  589. lsm_db *pDb, /* Database handle */
  590. int nLevel, /* Number of levels to write to blob */
  591. void **paVal, /* OUT: Pointer to LEVELS blob */
  592. int *pnVal /* OUT: Size of LEVELS blob in bytes */
  593. ){
  594. Level *p; /* Used to iterate through levels */
  595. int nAll= 0;
  596. int rc;
  597. int i;
  598. int iOut;
  599. CkptBuffer ckpt;
  600. assert( nLevel>0 );
  601. for(p=lsmDbSnapshotLevel(pDb->pWorker); p; p=p->pNext) nAll++;
  602. assert( nAll>nLevel );
  603. nAll -= nLevel;
  604. for(p=lsmDbSnapshotLevel(pDb->pWorker); p && nAll>0; p=p->pNext) nAll--;
  605. memset(&ckpt, 0, sizeof(CkptBuffer));
  606. ckpt.pEnv = pDb->pEnv;
  607. ckptSetValue(&ckpt, 0, nLevel, &rc);
  608. iOut = 1;
  609. for(i=0; rc==LSM_OK && i<nLevel; i++){
  610. ckptExportLevel(p, &ckpt, &iOut, &rc);
  611. p = p->pNext;
  612. }
  613. assert( rc!=LSM_OK || p==0 );
  614. if( rc==LSM_OK ){
  615. ckptChangeEndianness(ckpt.aCkpt, iOut);
  616. *paVal = (void *)ckpt.aCkpt;
  617. *pnVal = iOut * sizeof(u32);
  618. }else{
  619. *pnVal = 0;
  620. *paVal = 0;
  621. }
  622. return rc;
  623. }
  624. /*
  625. ** Read the checkpoint id from meta-page pPg.
  626. */
  627. static i64 ckptLoadId(MetaPage *pPg){
  628. i64 ret = 0;
  629. if( pPg ){
  630. int nData;
  631. u8 *aData = lsmFsMetaPageData(pPg, &nData);
  632. ret = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32) +
  633. ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]));
  634. }
  635. return ret;
  636. }
  637. /*
  638. ** Return true if the buffer passed as an argument contains a valid
  639. ** checkpoint.
  640. */
  641. static int ckptChecksumOk(u32 *aCkpt){
  642. u32 nCkpt = aCkpt[CKPT_HDR_NCKPT];
  643. u32 cksum1;
  644. u32 cksum2;
  645. if( nCkpt<CKPT_HDR_NCKPT || nCkpt>(LSM_META_RW_PAGE_SIZE)/sizeof(u32) ){
  646. return 0;
  647. }
  648. ckptChecksum(aCkpt, nCkpt, &cksum1, &cksum2);
  649. return (cksum1==aCkpt[nCkpt-2] && cksum2==aCkpt[nCkpt-1]);
  650. }
  651. /*
  652. ** Attempt to load a checkpoint from meta page iMeta.
  653. **
  654. ** This function is a no-op if *pRc is set to any value other than LSM_OK
  655. ** when it is called. If an error occurs, *pRc is set to an LSM error code
  656. ** before returning.
  657. **
  658. ** If no error occurs and the checkpoint is successfully loaded, copy it to
  659. ** ShmHeader.aSnap1[] and ShmHeader.aSnap2[], and set ShmHeader.iMetaPage
  660. ** to indicate its origin. In this case return 1. Or, if the checkpoint
  661. ** cannot be loaded (because the checksum does not compute), return 0.
  662. */
  663. static int ckptTryLoad(lsm_db *pDb, MetaPage *pPg, u32 iMeta, int *pRc){
  664. int bLoaded = 0; /* Return value */
  665. if( *pRc==LSM_OK ){
  666. int rc = LSM_OK; /* Error code */
  667. u32 *aCkpt = 0; /* Pointer to buffer containing checkpoint */
  668. u32 nCkpt; /* Number of elements in aCkpt[] */
  669. int nData; /* Bytes of data in aData[] */
  670. u8 *aData; /* Meta page data */
  671. aData = lsmFsMetaPageData(pPg, &nData);
  672. nCkpt = (u32)lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
  673. if( nCkpt<=nData/sizeof(u32) && nCkpt>CKPT_HDR_NCKPT ){
  674. aCkpt = (u32 *)lsmMallocRc(pDb->pEnv, nCkpt*sizeof(u32), &rc);
  675. }
  676. if( aCkpt ){
  677. memcpy(aCkpt, aData, nCkpt*sizeof(u32));
  678. ckptChangeEndianness(aCkpt, nCkpt);
  679. if( ckptChecksumOk(aCkpt) ){
  680. ShmHeader *pShm = pDb->pShmhdr;
  681. memcpy(pShm->aSnap1, aCkpt, nCkpt*sizeof(u32));
  682. memcpy(pShm->aSnap2, aCkpt, nCkpt*sizeof(u32));
  683. memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32));
  684. pShm->iMetaPage = iMeta;
  685. bLoaded = 1;
  686. }
  687. }
  688. lsmFree(pDb->pEnv, aCkpt);
  689. *pRc = rc;
  690. }
  691. return bLoaded;
  692. }
  693. /*
  694. ** Initialize the shared-memory header with an empty snapshot. This function
  695. ** is called when no valid snapshot can be found in the database header.
  696. */
  697. static void ckptLoadEmpty(lsm_db *pDb){
  698. u32 aCkpt[] = {
  699. 0, /* CKPT_HDR_ID_MSW */
  700. 10, /* CKPT_HDR_ID_LSW */
  701. 0, /* CKPT_HDR_NCKPT */
  702. LSM_COMPRESSION_EMPTY, /* CKPT_HDR_CMPID */
  703. 0, /* CKPT_HDR_NBLOCK */
  704. 0, /* CKPT_HDR_BLKSZ */
  705. 0, /* CKPT_HDR_NLEVEL */
  706. 0, /* CKPT_HDR_PGSZ */
  707. 0, /* CKPT_HDR_NWRITE */
  708. 0, 0, 1234, 5678, /* The log pointer and initial checksum */
  709. 0,0,0,0, 0,0,0,0, /* The append list */
  710. 0, /* The redirected block list */
  711. 0, /* The free block list */
  712. 0, 0 /* Space for checksum values */
  713. };
  714. u32 nCkpt = array_size(aCkpt);
  715. ShmHeader *pShm = pDb->pShmhdr;
  716. aCkpt[CKPT_HDR_NCKPT] = nCkpt;
  717. aCkpt[CKPT_HDR_BLKSZ] = pDb->nDfltBlksz;
  718. aCkpt[CKPT_HDR_PGSZ] = pDb->nDfltPgsz;
  719. ckptChecksum(aCkpt, array_size(aCkpt), &aCkpt[nCkpt-2], &aCkpt[nCkpt-1]);
  720. memcpy(pShm->aSnap1, aCkpt, nCkpt*sizeof(u32));
  721. memcpy(pShm->aSnap2, aCkpt, nCkpt*sizeof(u32));
  722. memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32));
  723. }
  724. /*
  725. ** This function is called as part of database recovery to initialize the
  726. ** ShmHeader.aSnap1[] and ShmHeader.aSnap2[] snapshots.
  727. */
  728. int lsmCheckpointRecover(lsm_db *pDb){
  729. int rc = LSM_OK; /* Return Code */
  730. i64 iId1; /* Id of checkpoint on meta-page 1 */
  731. i64 iId2; /* Id of checkpoint on meta-page 2 */
  732. int bLoaded = 0; /* True once checkpoint has been loaded */
  733. int cmp; /* True if (iId2>iId1) */
  734. MetaPage *apPg[2] = {0, 0}; /* Meta-pages 1 and 2 */
  735. rc = lsmFsMetaPageGet(pDb->pFS, 0, 1, &apPg[0]);
  736. if( rc==LSM_OK ) rc = lsmFsMetaPageGet(pDb->pFS, 0, 2, &apPg[1]);
  737. iId1 = ckptLoadId(apPg[0]);
  738. iId2 = ckptLoadId(apPg[1]);
  739. cmp = (iId2 > iId1);
  740. bLoaded = ckptTryLoad(pDb, apPg[cmp?1:0], (cmp?2:1), &rc);
  741. if( bLoaded==0 ){
  742. bLoaded = ckptTryLoad(pDb, apPg[cmp?0:1], (cmp?1:2), &rc);
  743. }
  744. /* The database does not contain a valid checkpoint. Initialize the shared
  745. ** memory header with an empty checkpoint. */
  746. if( bLoaded==0 ){
  747. ckptLoadEmpty(pDb);
  748. }
  749. lsmFsMetaPageRelease(apPg[0]);
  750. lsmFsMetaPageRelease(apPg[1]);
  751. return rc;
  752. }
  753. /*
  754. ** Store the snapshot in pDb->aSnapshot[] in meta-page iMeta.
  755. */
  756. int lsmCheckpointStore(lsm_db *pDb, int iMeta){
  757. MetaPage *pPg = 0;
  758. int rc;
  759. assert( iMeta==1 || iMeta==2 );
  760. rc = lsmFsMetaPageGet(pDb->pFS, 1, iMeta, &pPg);
  761. if( rc==LSM_OK ){
  762. u8 *aData;
  763. int nData;
  764. int nCkpt;
  765. nCkpt = (int)pDb->aSnapshot[CKPT_HDR_NCKPT];
  766. aData = lsmFsMetaPageData(pPg, &nData);
  767. memcpy(aData, pDb->aSnapshot, nCkpt*sizeof(u32));
  768. ckptChangeEndianness((u32 *)aData, nCkpt);
  769. rc = lsmFsMetaPageRelease(pPg);
  770. }
  771. return rc;
  772. }
  773. /*
  774. ** Copy the current client snapshot from shared-memory to pDb->aSnapshot[].
  775. */
  776. int lsmCheckpointLoad(lsm_db *pDb, int *piRead){
  777. int nRem = LSM_ATTEMPTS_BEFORE_PROTOCOL;
  778. ShmHeader *pShm = pDb->pShmhdr;
  779. while( (nRem--)>0 ){
  780. int nInt;
  781. nInt = pShm->aSnap1[CKPT_HDR_NCKPT];
  782. if( nInt<=(LSM_META_RW_PAGE_SIZE / sizeof(u32)) ){
  783. memcpy(pDb->aSnapshot, pShm->aSnap1, nInt*sizeof(u32));
  784. if( ckptChecksumOk(pDb->aSnapshot) ){
  785. if( piRead ) *piRead = 1;
  786. return LSM_OK;
  787. }
  788. }
  789. nInt = pShm->aSnap2[CKPT_HDR_NCKPT];
  790. if( nInt<=(LSM_META_RW_PAGE_SIZE / sizeof(u32)) ){
  791. memcpy(pDb->aSnapshot, pShm->aSnap2, nInt*sizeof(u32));
  792. if( ckptChecksumOk(pDb->aSnapshot) ){
  793. if( piRead ) *piRead = 2;
  794. return LSM_OK;
  795. }
  796. }
  797. lsmShmBarrier(pDb);
  798. }
  799. return LSM_PROTOCOL_BKPT;
  800. }
  801. int lsmInfoCompressionId(lsm_db *db, u32 *piCmpId){
  802. int rc;
  803. assert( db->pClient==0 && db->pWorker==0 );
  804. rc = lsmCheckpointLoad(db, 0);
  805. if( rc==LSM_OK ){
  806. *piCmpId = db->aSnapshot[CKPT_HDR_CMPID];
  807. }
  808. return rc;
  809. }
  810. int lsmCheckpointLoadOk(lsm_db *pDb, int iSnap){
  811. u32 *aShm;
  812. assert( iSnap==1 || iSnap==2 );
  813. aShm = (iSnap==1) ? pDb->pShmhdr->aSnap1 : pDb->pShmhdr->aSnap2;
  814. return (lsmCheckpointId(pDb->aSnapshot, 0)==lsmCheckpointId(aShm, 0) );
  815. }
  816. int lsmCheckpointClientCacheOk(lsm_db *pDb){
  817. return ( pDb->pClient
  818. && pDb->pClient->iId==lsmCheckpointId(pDb->aSnapshot, 0)
  819. && pDb->pClient->iId==lsmCheckpointId(pDb->pShmhdr->aSnap1, 0)
  820. && pDb->pClient->iId==lsmCheckpointId(pDb->pShmhdr->aSnap2, 0)
  821. );
  822. }
  823. int lsmCheckpointLoadWorker(lsm_db *pDb){
  824. int rc;
  825. ShmHeader *pShm = pDb->pShmhdr;
  826. int nInt1;
  827. int nInt2;
  828. /* Must be holding the WORKER lock to do this. Or DMS2. */
  829. assert(
  830. lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL)
  831. || lsmShmAssertLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL)
  832. );
  833. /* Check that the two snapshots match. If not, repair them. */
  834. nInt1 = pShm->aSnap1[CKPT_HDR_NCKPT];
  835. nInt2 = pShm->aSnap2[CKPT_HDR_NCKPT];
  836. if( nInt1!=nInt2 || memcmp(pShm->aSnap1, pShm->aSnap2, nInt2*sizeof(u32)) ){
  837. if( ckptChecksumOk(pShm->aSnap1) ){
  838. memcpy(pShm->aSnap2, pShm->aSnap1, sizeof(u32)*nInt1);
  839. }else if( ckptChecksumOk(pShm->aSnap2) ){
  840. memcpy(pShm->aSnap1, pShm->aSnap2, sizeof(u32)*nInt2);
  841. }else{
  842. return LSM_PROTOCOL_BKPT;
  843. }
  844. }
  845. rc = lsmCheckpointDeserialize(pDb, 1, pShm->aSnap1, &pDb->pWorker);
  846. if( pDb->pWorker ) pDb->pWorker->pDatabase = pDb->pDatabase;
  847. if( rc==LSM_OK ){
  848. rc = lsmCheckCompressionId(pDb, pDb->pWorker->iCmpId);
  849. }
  850. #if 0
  851. assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
  852. #endif
  853. return rc;
  854. }
  855. int lsmCheckpointDeserialize(
  856. lsm_db *pDb,
  857. int bInclFreelist, /* If true, deserialize free-list */
  858. u32 *aCkpt,
  859. Snapshot **ppSnap
  860. ){
  861. int rc = LSM_OK;
  862. Snapshot *pNew;
  863. pNew = (Snapshot *)lsmMallocZeroRc(pDb->pEnv, sizeof(Snapshot), &rc);
  864. if( rc==LSM_OK ){
  865. Level *pLvl;
  866. int nFree;
  867. int i;
  868. int nLevel = (int)aCkpt[CKPT_HDR_NLEVEL];
  869. int iIn = CKPT_HDR_SIZE + CKPT_APPENDLIST_SIZE + CKPT_LOGPTR_SIZE;
  870. pNew->iId = lsmCheckpointId(aCkpt, 0);
  871. pNew->nBlock = aCkpt[CKPT_HDR_NBLOCK];
  872. pNew->nWrite = aCkpt[CKPT_HDR_NWRITE];
  873. rc = ckptLoadLevels(pDb, aCkpt, &iIn, nLevel, &pNew->pLevel);
  874. pNew->iLogOff = lsmCheckpointLogOffset(aCkpt);
  875. pNew->iCmpId = aCkpt[CKPT_HDR_CMPID];
  876. /* Make a copy of the append-list */
  877. for(i=0; i<LSM_APPLIST_SZ; i++){
  878. u32 *a = &aCkpt[CKPT_HDR_SIZE + CKPT_LOGPTR_SIZE + i*2];
  879. pNew->aiAppend[i] = ckptRead64(a);
  880. }
  881. /* Read the block-redirect list */
  882. pNew->redirect.n = aCkpt[iIn++];
  883. if( pNew->redirect.n ){
  884. pNew->redirect.a = lsmMallocZeroRc(pDb->pEnv,
  885. (sizeof(struct RedirectEntry) * LSM_MAX_BLOCK_REDIRECTS), &rc
  886. );
  887. if( rc==LSM_OK ){
  888. for(i=0; i<pNew->redirect.n; i++){
  889. pNew->redirect.a[i].iFrom = aCkpt[iIn++];
  890. pNew->redirect.a[i].iTo = aCkpt[iIn++];
  891. }
  892. }
  893. for(pLvl=pNew->pLevel; pLvl->pNext; pLvl=pLvl->pNext);
  894. if( pLvl->nRight ){
  895. pLvl->aRhs[pLvl->nRight-1].pRedirect = &pNew->redirect;
  896. }else{
  897. pLvl->lhs.pRedirect = &pNew->redirect;
  898. }
  899. }
  900. /* Copy the free-list */
  901. if( rc==LSM_OK && bInclFreelist ){
  902. nFree = aCkpt[iIn++];
  903. if( nFree ){
  904. pNew->freelist.aEntry = (FreelistEntry *)lsmMallocZeroRc(
  905. pDb->pEnv, sizeof(FreelistEntry)*nFree, &rc
  906. );
  907. if( rc==LSM_OK ){
  908. int j;
  909. for(j=0; j<nFree; j++){
  910. FreelistEntry *p = &pNew->freelist.aEntry[j];
  911. p->iBlk = aCkpt[iIn++];
  912. p->iId = ((i64)(aCkpt[iIn])<<32) + aCkpt[iIn+1];
  913. iIn += 2;
  914. }
  915. pNew->freelist.nEntry = pNew->freelist.nAlloc = nFree;
  916. }
  917. }
  918. }
  919. }
  920. if( rc!=LSM_OK ){
  921. lsmFreeSnapshot(pDb->pEnv, pNew);
  922. pNew = 0;
  923. }
  924. *ppSnap = pNew;
  925. return rc;
  926. }
  927. /*
  928. ** Connection pDb must be the worker connection in order to call this
  929. ** function. It returns true if the database already contains the maximum
  930. ** number of levels or false otherwise.
  931. **
  932. ** This is used when flushing the in-memory tree to disk. If the database
  933. ** is already full, then the caller should invoke lsm_work() or similar
  934. ** until it is not full before creating a new level by flushing the in-memory
  935. ** tree to disk. Limiting the number of levels in the database ensures that
  936. ** the records describing them always fit within the checkpoint blob.
  937. */
  938. int lsmDatabaseFull(lsm_db *pDb){
  939. Level *p;
  940. int nRhs = 0;
  941. assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) );
  942. assert( pDb->pWorker );
  943. for(p=pDb->pWorker->pLevel; p; p=p->pNext){
  944. nRhs += (p->nRight ? p->nRight : 1);
  945. }
  946. return (nRhs >= LSM_MAX_RHS_SEGMENTS);
  947. }
  948. /*
  949. ** The connection passed as the only argument is currently the worker
  950. ** connection. Some work has been performed on the database by the connection,
  951. ** but no new snapshot has been written into shared memory.
  952. **
  953. ** This function updates the shared-memory worker and client snapshots with
  954. ** the new snapshot produced by the work performed by pDb.
  955. **
  956. ** If successful, LSM_OK is returned. Otherwise, if an error occurs, an LSM
  957. ** error code is returned.
  958. */
  959. int lsmCheckpointSaveWorker(lsm_db *pDb, int bFlush){
  960. Snapshot *pSnap = pDb->pWorker;
  961. ShmHeader *pShm = pDb->pShmhdr;
  962. void *p = 0;
  963. int n = 0;
  964. int rc;
  965. pSnap->iId++;
  966. rc = ckptExportSnapshot(pDb, bFlush, pSnap->iId, 1, &p, &n);
  967. if( rc!=LSM_OK ) return rc;
  968. assert( ckptChecksumOk((u32 *)p) );
  969. assert( n<=LSM_META_RW_PAGE_SIZE );
  970. memcpy(pShm->aSnap2, p, n);
  971. lsmShmBarrier(pDb);
  972. memcpy(pShm->aSnap1, p, n);
  973. lsmFree(pDb->pEnv, p);
  974. /* assert( lsmFsIntegrityCheck(pDb) ); */
  975. return LSM_OK;
  976. }
  977. /*
  978. ** This function is used to determine the snapshot-id of the most recently
  979. ** checkpointed snapshot. Variable ShmHeader.iMetaPage indicates which of
  980. ** the two meta-pages said snapshot resides on (if any).
  981. **
  982. ** If successful, this function loads the snapshot from the meta-page,
  983. ** verifies its checksum and sets *piId to the snapshot-id before returning
  984. ** LSM_OK. Or, if the checksum attempt fails, *piId is set to zero and
  985. ** LSM_OK returned. If an error occurs, an LSM error code is returned and
  986. ** the final value of *piId is undefined.
  987. */
  988. int lsmCheckpointSynced(lsm_db *pDb, i64 *piId, i64 *piLog, u32 *pnWrite){
  989. int rc = LSM_OK;
  990. MetaPage *pPg;
  991. u32 iMeta;
  992. iMeta = pDb->pShmhdr->iMetaPage;
  993. if( iMeta==1 || iMeta==2 ){
  994. rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg);
  995. if( rc==LSM_OK ){
  996. int nCkpt;
  997. int nData;
  998. u8 *aData;
  999. aData = lsmFsMetaPageData(pPg, &nData);
  1000. assert( nData==LSM_META_RW_PAGE_SIZE );
  1001. nCkpt = lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
  1002. if( nCkpt<(LSM_META_RW_PAGE_SIZE/sizeof(u32)) ){
  1003. u32 *aCopy = lsmMallocRc(pDb->pEnv, sizeof(u32) * nCkpt, &rc);
  1004. if( aCopy ){
  1005. memcpy(aCopy, aData, nCkpt*sizeof(u32));
  1006. ckptChangeEndianness(aCopy, nCkpt);
  1007. if( ckptChecksumOk(aCopy) ){
  1008. if( piId ) *piId = lsmCheckpointId(aCopy, 0);
  1009. if( piLog ) *piLog = (lsmCheckpointLogOffset(aCopy) >> 1);
  1010. if( pnWrite ) *pnWrite = aCopy[CKPT_HDR_NWRITE];
  1011. }
  1012. lsmFree(pDb->pEnv, aCopy);
  1013. }
  1014. }
  1015. lsmFsMetaPageRelease(pPg);
  1016. }
  1017. }
  1018. if( (iMeta!=1 && iMeta!=2) || rc!=LSM_OK || pDb->pShmhdr->iMetaPage!=iMeta ){
  1019. if( piId ) *piId = 0;
  1020. if( piLog ) *piLog = 0;
  1021. if( pnWrite ) *pnWrite = 0;
  1022. }
  1023. return rc;
  1024. }
  1025. /*
  1026. ** Return the checkpoint-id of the checkpoint array passed as the first
  1027. ** argument to this function. If the second argument is true, then assume
  1028. ** that the checkpoint is made up of 32-bit big-endian integers. If it
  1029. ** is false, assume that the integers are in machine byte order.
  1030. */
  1031. i64 lsmCheckpointId(u32 *aCkpt, int bDisk){
  1032. i64 iId;
  1033. if( bDisk ){
  1034. u8 *aData = (u8 *)aCkpt;
  1035. iId = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32);
  1036. iId += ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]));
  1037. }else{
  1038. iId = ((i64)aCkpt[CKPT_HDR_ID_MSW] << 32) + (i64)aCkpt[CKPT_HDR_ID_LSW];
  1039. }
  1040. return iId;
  1041. }
  1042. u32 lsmCheckpointNBlock(u32 *aCkpt){
  1043. return aCkpt[CKPT_HDR_NBLOCK];
  1044. }
  1045. u32 lsmCheckpointNWrite(u32 *aCkpt, int bDisk){
  1046. if( bDisk ){
  1047. return lsmGetU32((u8 *)&aCkpt[CKPT_HDR_NWRITE]);
  1048. }else{
  1049. return aCkpt[CKPT_HDR_NWRITE];
  1050. }
  1051. }
  1052. i64 lsmCheckpointLogOffset(u32 *aCkpt){
  1053. return ((i64)aCkpt[CKPT_HDR_LO_MSW] << 32) + (i64)aCkpt[CKPT_HDR_LO_LSW];
  1054. }
  1055. int lsmCheckpointPgsz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_PGSZ]; }
  1056. int lsmCheckpointBlksz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_BLKSZ]; }
  1057. void lsmCheckpointLogoffset(
  1058. u32 *aCkpt,
  1059. DbLog *pLog
  1060. ){
  1061. pLog->aRegion[2].iStart = (lsmCheckpointLogOffset(aCkpt) >> 1);
  1062. pLog->cksum0 = aCkpt[CKPT_HDR_LO_CKSUM1];
  1063. pLog->cksum1 = aCkpt[CKPT_HDR_LO_CKSUM2];
  1064. pLog->iSnapshotId = lsmCheckpointId(aCkpt, 0);
  1065. }
  1066. void lsmCheckpointZeroLogoffset(lsm_db *pDb){
  1067. u32 nCkpt;
  1068. nCkpt = pDb->aSnapshot[CKPT_HDR_NCKPT];
  1069. assert( nCkpt>CKPT_HDR_NCKPT );
  1070. assert( nCkpt==pDb->pShmhdr->aSnap1[CKPT_HDR_NCKPT] );
  1071. assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aSnap1, nCkpt*sizeof(u32)) );
  1072. assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aSnap2, nCkpt*sizeof(u32)) );
  1073. pDb->aSnapshot[CKPT_HDR_LO_MSW] = 0;
  1074. pDb->aSnapshot[CKPT_HDR_LO_LSW] = 0;
  1075. ckptChecksum(pDb->aSnapshot, nCkpt,
  1076. &pDb->aSnapshot[nCkpt-2], &pDb->aSnapshot[nCkpt-1]
  1077. );
  1078. memcpy(pDb->pShmhdr->aSnap1, pDb->aSnapshot, nCkpt*sizeof(u32));
  1079. memcpy(pDb->pShmhdr->aSnap2, pDb->aSnapshot, nCkpt*sizeof(u32));
  1080. }
  1081. /*
  1082. ** Set the output variable to the number of KB of data written into the
  1083. ** database file since the most recent checkpoint.
  1084. */
  1085. int lsmCheckpointSize(lsm_db *db, int *pnKB){
  1086. int rc = LSM_OK;
  1087. u32 nSynced;
  1088. /* Set nSynced to the number of pages that had been written when the
  1089. ** database was last checkpointed. */
  1090. rc = lsmCheckpointSynced(db, 0, 0, &nSynced);
  1091. if( rc==LSM_OK ){
  1092. u32 nPgsz = db->pShmhdr->aSnap1[CKPT_HDR_PGSZ];
  1093. u32 nWrite = db->pShmhdr->aSnap1[CKPT_HDR_NWRITE];
  1094. *pnKB = (int)(( ((i64)(nWrite - nSynced) * nPgsz) + 1023) / 1024);
  1095. }
  1096. return rc;
  1097. }