percentile.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504
  1. /*
  2. ** 2013-05-28
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. **
  13. ** This file contains code to implement the percentile(Y,P) SQL function
  14. ** and similar as described below:
  15. **
  16. ** (1) The percentile(Y,P) function is an aggregate function taking
  17. ** exactly two arguments.
  18. **
  19. ** (2) If the P argument to percentile(Y,P) is not the same for every
  20. ** row in the aggregate then an error is thrown. The word "same"
  21. ** in the previous sentence means that the value differ by less
  22. ** than 0.001.
  23. **
  24. ** (3) If the P argument to percentile(Y,P) evaluates to anything other
  25. ** than a number in the range of 0.0 to 100.0 inclusive then an
  26. ** error is thrown.
  27. **
  28. ** (4) If any Y argument to percentile(Y,P) evaluates to a value that
  29. ** is not NULL and is not numeric then an error is thrown.
  30. **
  31. ** (5) If any Y argument to percentile(Y,P) evaluates to plus or minus
  32. ** infinity then an error is thrown. (SQLite always interprets NaN
  33. ** values as NULL.)
  34. **
  35. ** (6) Both Y and P in percentile(Y,P) can be arbitrary expressions,
  36. ** including CASE WHEN expressions.
  37. **
  38. ** (7) The percentile(Y,P) aggregate is able to handle inputs of at least
  39. ** one million (1,000,000) rows.
  40. **
  41. ** (8) If there are no non-NULL values for Y, then percentile(Y,P)
  42. ** returns NULL.
  43. **
  44. ** (9) If there is exactly one non-NULL value for Y, the percentile(Y,P)
  45. ** returns the one Y value.
  46. **
  47. ** (10) If there N non-NULL values of Y where N is two or more and
  48. ** the Y values are ordered from least to greatest and a graph is
  49. ** drawn from 0 to N-1 such that the height of the graph at J is
  50. ** the J-th Y value and such that straight lines are drawn between
  51. ** adjacent Y values, then the percentile(Y,P) function returns
  52. ** the height of the graph at P*(N-1)/100.
  53. **
  54. ** (11) The percentile(Y,P) function always returns either a floating
  55. ** point number or NULL.
  56. **
  57. ** (12) The percentile(Y,P) is implemented as a single C99 source-code
  58. ** file that compiles into a shared-library or DLL that can be loaded
  59. ** into SQLite using the sqlite3_load_extension() interface.
  60. **
  61. ** (13) A separate median(Y) function is the equivalent percentile(Y,50).
  62. **
  63. ** (14) A separate percentile_cont(Y,P) function is equivalent to
  64. ** percentile(Y,P/100.0). In other words, the fraction value in
  65. ** the second argument is in the range of 0 to 1 instead of 0 to 100.
  66. **
  67. ** (15) A separate percentile_disc(Y,P) function is like
  68. ** percentile_cont(Y,P) except that instead of returning the weighted
  69. ** average of the nearest two input values, it returns the next lower
  70. ** value. So the percentile_disc(Y,P) will always return a value
  71. ** that was one of the inputs.
  72. **
  73. ** (16) All of median(), percentile(Y,P), percentile_cont(Y,P) and
  74. ** percentile_disc(Y,P) can be used as window functions.
  75. **
  76. ** Differences from standard SQL:
  77. **
  78. ** * The percentile_cont(X,P) function is equivalent to the following in
  79. ** standard SQL:
  80. **
  81. ** (percentile_cont(P) WITHIN GROUP (ORDER BY X))
  82. **
  83. ** The SQLite syntax is much more compact. The standard SQL syntax
  84. ** is also supported if SQLite is compiled with the
  85. ** -DSQLITE_ENABLE_ORDERED_SET_AGGREGATES option.
  86. **
  87. ** * No median(X) function exists in the SQL standard. App developers
  88. ** are expected to write "percentile_cont(0.5)WITHIN GROUP(ORDER BY X)".
  89. **
  90. ** * No percentile(Y,P) function exists in the SQL standard. Instead of
  91. ** percential(Y,P), developers must write this:
  92. ** "percentile_cont(P/100.0) WITHIN GROUP (ORDER BY Y)". Note that
  93. ** the fraction parameter to percentile() goes from 0 to 100 whereas
  94. ** the fraction parameter in SQL standard percentile_cont() goes from
  95. ** 0 to 1.
  96. **
  97. ** Implementation notes as of 2024-08-31:
  98. **
  99. ** * The regular aggregate-function versions of these routines work
  100. ** by accumulating all values in an array of doubles, then sorting
  101. ** that array using quicksort before computing the answer. Thus
  102. ** the runtime is O(NlogN) where N is the number of rows of input.
  103. **
  104. ** * For the window-function versions of these routines, the array of
  105. ** inputs is sorted as soon as the first value is computed. Thereafter,
  106. ** the array is kept in sorted order using an insert-sort. This
  107. ** results in O(N*K) performance where K is the size of the window.
  108. ** One can imagine alternative implementations that give O(N*logN*logK)
  109. ** performance, but they require more complex logic and data structures.
  110. ** The developers have elected to keep the asymptotically slower
  111. ** algorithm for now, for simplicity, under the theory that window
  112. ** functions are seldom used and when they are, the window size K is
  113. ** often small. The developers might revisit that decision later,
  114. ** should the need arise.
  115. */
  116. #if defined(SQLITE3_H)
  117. /* no-op */
  118. #elif defined(SQLITE_STATIC_PERCENTILE)
  119. # include "sqlite3.h"
  120. #else
  121. # include "sqlite3ext.h"
  122. SQLITE_EXTENSION_INIT1
  123. #endif
  124. #include <assert.h>
  125. #include <string.h>
  126. #include <stdlib.h>
  127. /* The following object is the group context for a single percentile()
  128. ** aggregate. Remember all input Y values until the very end.
  129. ** Those values are accumulated in the Percentile.a[] array.
  130. */
  131. typedef struct Percentile Percentile;
  132. struct Percentile {
  133. unsigned nAlloc; /* Number of slots allocated for a[] */
  134. unsigned nUsed; /* Number of slots actually used in a[] */
  135. char bSorted; /* True if a[] is already in sorted order */
  136. char bKeepSorted; /* True if advantageous to keep a[] sorted */
  137. char bPctValid; /* True if rPct is valid */
  138. double rPct; /* Fraction. 0.0 to 1.0 */
  139. double *a; /* Array of Y values */
  140. };
  141. /* Details of each function in the percentile family */
  142. typedef struct PercentileFunc PercentileFunc;
  143. struct PercentileFunc {
  144. const char *zName; /* Function name */
  145. char nArg; /* Number of arguments */
  146. char mxFrac; /* Maximum value of the "fraction" input */
  147. char bDiscrete; /* True for percentile_disc() */
  148. };
  149. static const PercentileFunc aPercentFunc[] = {
  150. { "median", 1, 1, 0 },
  151. { "percentile", 2, 100, 0 },
  152. { "percentile_cont", 2, 1, 0 },
  153. { "percentile_disc", 2, 1, 1 },
  154. };
  155. /*
  156. ** Return TRUE if the input floating-point number is an infinity.
  157. */
  158. static int percentIsInfinity(double r){
  159. sqlite3_uint64 u;
  160. assert( sizeof(u)==sizeof(r) );
  161. memcpy(&u, &r, sizeof(u));
  162. return ((u>>52)&0x7ff)==0x7ff;
  163. }
  164. /*
  165. ** Return TRUE if two doubles differ by 0.001 or less.
  166. */
  167. static int percentSameValue(double a, double b){
  168. a -= b;
  169. return a>=-0.001 && a<=0.001;
  170. }
  171. /*
  172. ** Search p (which must have p->bSorted) looking for an entry with
  173. ** value y. Return the index of that entry.
  174. **
  175. ** If bExact is true, return -1 if the entry is not found.
  176. **
  177. ** If bExact is false, return the index at which a new entry with
  178. ** value y should be insert in order to keep the values in sorted
  179. ** order. The smallest return value in this case will be 0, and
  180. ** the largest return value will be p->nUsed.
  181. */
  182. static int percentBinarySearch(Percentile *p, double y, int bExact){
  183. int iFirst = 0; /* First element of search range */
  184. int iLast = p->nUsed - 1; /* Last element of search range */
  185. while( iLast>=iFirst ){
  186. int iMid = (iFirst+iLast)/2;
  187. double x = p->a[iMid];
  188. if( x<y ){
  189. iFirst = iMid + 1;
  190. }else if( x>y ){
  191. iLast = iMid - 1;
  192. }else{
  193. return iMid;
  194. }
  195. }
  196. if( bExact ) return -1;
  197. return iFirst;
  198. }
  199. /*
  200. ** Generate an error for a percentile function.
  201. **
  202. ** The error format string must have exactly one occurrance of "%%s()"
  203. ** (with two '%' characters). That substring will be replaced by the name
  204. ** of the function.
  205. */
  206. static void percentError(sqlite3_context *pCtx, const char *zFormat, ...){
  207. PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
  208. char *zMsg1;
  209. char *zMsg2;
  210. va_list ap;
  211. va_start(ap, zFormat);
  212. zMsg1 = sqlite3_vmprintf(zFormat, ap);
  213. va_end(ap);
  214. zMsg2 = zMsg1 ? sqlite3_mprintf(zMsg1, pFunc->zName) : 0;
  215. sqlite3_result_error(pCtx, zMsg2, -1);
  216. sqlite3_free(zMsg1);
  217. sqlite3_free(zMsg2);
  218. }
  219. /*
  220. ** The "step" function for percentile(Y,P) is called once for each
  221. ** input row.
  222. */
  223. static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
  224. Percentile *p;
  225. double rPct;
  226. int eType;
  227. double y;
  228. assert( argc==2 || argc==1 );
  229. if( argc==1 ){
  230. /* Requirement 13: median(Y) is the same as percentile(Y,50). */
  231. rPct = 0.5;
  232. }else{
  233. /* Requirement 3: P must be a number between 0 and 100 */
  234. PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
  235. eType = sqlite3_value_numeric_type(argv[1]);
  236. rPct = sqlite3_value_double(argv[1])/(double)pFunc->mxFrac;
  237. if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT)
  238. || rPct<0.0 || rPct>1.0
  239. ){
  240. percentError(pCtx, "the fraction argument to %%s()"
  241. " is not between 0.0 and %.1f",
  242. (double)pFunc->mxFrac);
  243. return;
  244. }
  245. }
  246. /* Allocate the session context. */
  247. p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
  248. if( p==0 ) return;
  249. /* Remember the P value. Throw an error if the P value is different
  250. ** from any prior row, per Requirement (2). */
  251. if( !p->bPctValid ){
  252. p->rPct = rPct;
  253. p->bPctValid = 1;
  254. }else if( !percentSameValue(p->rPct,rPct) ){
  255. percentError(pCtx, "the fraction argument to %%s()"
  256. " is not the same for all input rows");
  257. return;
  258. }
  259. /* Ignore rows for which Y is NULL */
  260. eType = sqlite3_value_type(argv[0]);
  261. if( eType==SQLITE_NULL ) return;
  262. /* If not NULL, then Y must be numeric. Otherwise throw an error.
  263. ** Requirement 4 */
  264. if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
  265. percentError(pCtx, "input to %%s() is not numeric");
  266. return;
  267. }
  268. /* Throw an error if the Y value is infinity or NaN */
  269. y = sqlite3_value_double(argv[0]);
  270. if( percentIsInfinity(y) ){
  271. percentError(pCtx, "Inf input to %%s()");
  272. return;
  273. }
  274. /* Allocate and store the Y */
  275. if( p->nUsed>=p->nAlloc ){
  276. unsigned n = p->nAlloc*2 + 250;
  277. double *a = sqlite3_realloc64(p->a, sizeof(double)*n);
  278. if( a==0 ){
  279. sqlite3_free(p->a);
  280. memset(p, 0, sizeof(*p));
  281. sqlite3_result_error_nomem(pCtx);
  282. return;
  283. }
  284. p->nAlloc = n;
  285. p->a = a;
  286. }
  287. if( p->nUsed==0 ){
  288. p->a[p->nUsed++] = y;
  289. p->bSorted = 1;
  290. }else if( !p->bSorted || y>=p->a[p->nUsed-1] ){
  291. p->a[p->nUsed++] = y;
  292. }else if( p->bKeepSorted ){
  293. int i;
  294. i = percentBinarySearch(p, y, 0);
  295. if( i<(int)p->nUsed ){
  296. memmove(&p->a[i+1], &p->a[i], (p->nUsed-i)*sizeof(p->a[0]));
  297. }
  298. p->a[i] = y;
  299. p->nUsed++;
  300. }else{
  301. p->a[p->nUsed++] = y;
  302. p->bSorted = 0;
  303. }
  304. }
  305. /*
  306. ** Interchange two doubles.
  307. */
  308. #define SWAP_DOUBLE(X,Y) {double ttt=(X);(X)=(Y);(Y)=ttt;}
  309. /*
  310. ** Sort an array of doubles.
  311. **
  312. ** Algorithm: quicksort
  313. **
  314. ** This is implemented separately rather than using the qsort() routine
  315. ** from the standard library because:
  316. **
  317. ** (1) To avoid a dependency on qsort()
  318. ** (2) To avoid the function call to the comparison routine for each
  319. ** comparison.
  320. */
  321. static void percentSort(double *a, unsigned int n){
  322. int iLt; /* Entries before a[iLt] are less than rPivot */
  323. int iGt; /* Entries at or after a[iGt] are greater than rPivot */
  324. int i; /* Loop counter */
  325. double rPivot; /* The pivot value */
  326. assert( n>=2 );
  327. if( a[0]>a[n-1] ){
  328. SWAP_DOUBLE(a[0],a[n-1])
  329. }
  330. if( n==2 ) return;
  331. iGt = n-1;
  332. i = n/2;
  333. if( a[0]>a[i] ){
  334. SWAP_DOUBLE(a[0],a[i])
  335. }else if( a[i]>a[iGt] ){
  336. SWAP_DOUBLE(a[i],a[iGt])
  337. }
  338. if( n==3 ) return;
  339. rPivot = a[i];
  340. iLt = i = 1;
  341. do{
  342. if( a[i]<rPivot ){
  343. if( i>iLt ) SWAP_DOUBLE(a[i],a[iLt])
  344. iLt++;
  345. i++;
  346. }else if( a[i]>rPivot ){
  347. do{
  348. iGt--;
  349. }while( iGt>i && a[iGt]>rPivot );
  350. SWAP_DOUBLE(a[i],a[iGt])
  351. }else{
  352. i++;
  353. }
  354. }while( i<iGt );
  355. if( iLt>=2 ) percentSort(a, iLt);
  356. if( n-iGt>=2 ) percentSort(a+iGt, n-iGt);
  357. /* Uncomment for testing */
  358. #if 0
  359. for(i=0; i<n-1; i++){
  360. assert( a[i]<=a[i+1] );
  361. }
  362. #endif
  363. }
  364. /*
  365. ** The "inverse" function for percentile(Y,P) is called to remove a
  366. ** row that was previously inserted by "step".
  367. */
  368. static void percentInverse(sqlite3_context *pCtx,int argc,sqlite3_value **argv){
  369. Percentile *p;
  370. int eType;
  371. double y;
  372. int i;
  373. assert( argc==2 || argc==1 );
  374. /* Allocate the session context. */
  375. p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
  376. assert( p!=0 );
  377. /* Ignore rows for which Y is NULL */
  378. eType = sqlite3_value_type(argv[0]);
  379. if( eType==SQLITE_NULL ) return;
  380. /* If not NULL, then Y must be numeric. Otherwise throw an error.
  381. ** Requirement 4 */
  382. if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
  383. return;
  384. }
  385. /* Ignore the Y value if it is infinity or NaN */
  386. y = sqlite3_value_double(argv[0]);
  387. if( percentIsInfinity(y) ){
  388. return;
  389. }
  390. if( p->bSorted==0 ){
  391. assert( p->nUsed>1 );
  392. percentSort(p->a, p->nUsed);
  393. p->bSorted = 1;
  394. }
  395. p->bKeepSorted = 1;
  396. /* Find and remove the row */
  397. i = percentBinarySearch(p, y, 1);
  398. if( i>=0 ){
  399. p->nUsed--;
  400. if( i<(int)p->nUsed ){
  401. memmove(&p->a[i], &p->a[i+1], (p->nUsed - i)*sizeof(p->a[0]));
  402. }
  403. }
  404. }
  405. /*
  406. ** Compute the final output of percentile(). Clean up all allocated
  407. ** memory if and only if bIsFinal is true.
  408. */
  409. static void percentCompute(sqlite3_context *pCtx, int bIsFinal){
  410. Percentile *p;
  411. PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
  412. unsigned i1, i2;
  413. double v1, v2;
  414. double ix, vx;
  415. p = (Percentile*)sqlite3_aggregate_context(pCtx, 0);
  416. if( p==0 ) return;
  417. if( p->a==0 ) return;
  418. if( p->nUsed ){
  419. if( p->bSorted==0 ){
  420. assert( p->nUsed>1 );
  421. percentSort(p->a, p->nUsed);
  422. p->bSorted = 1;
  423. }
  424. ix = p->rPct*(p->nUsed-1);
  425. i1 = (unsigned)ix;
  426. if( pFunc->bDiscrete ){
  427. vx = p->a[i1];
  428. }else{
  429. i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1;
  430. v1 = p->a[i1];
  431. v2 = p->a[i2];
  432. vx = v1 + (v2-v1)*(ix-i1);
  433. }
  434. sqlite3_result_double(pCtx, vx);
  435. }
  436. if( bIsFinal ){
  437. sqlite3_free(p->a);
  438. memset(p, 0, sizeof(*p));
  439. }else{
  440. p->bKeepSorted = 1;
  441. }
  442. }
  443. static void percentFinal(sqlite3_context *pCtx){
  444. percentCompute(pCtx, 1);
  445. }
  446. static void percentValue(sqlite3_context *pCtx){
  447. percentCompute(pCtx, 0);
  448. }
  449. #if defined(_WIN32) && !defined(SQLITE3_H) && !defined(SQLITE_STATIC_PERCENTILE)
  450. __declspec(dllexport)
  451. #endif
  452. int sqlite3_percentile_init(
  453. sqlite3 *db,
  454. char **pzErrMsg,
  455. const sqlite3_api_routines *pApi
  456. ){
  457. int rc = SQLITE_OK;
  458. unsigned int i;
  459. #ifdef SQLITE3EXT_H
  460. SQLITE_EXTENSION_INIT2(pApi);
  461. #else
  462. (void)pApi; /* Unused parameter */
  463. #endif
  464. (void)pzErrMsg; /* Unused parameter */
  465. for(i=0; i<sizeof(aPercentFunc)/sizeof(aPercentFunc[0]); i++){
  466. rc = sqlite3_create_window_function(db,
  467. aPercentFunc[i].zName,
  468. aPercentFunc[i].nArg,
  469. SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_SELFORDER1,
  470. (void*)&aPercentFunc[i],
  471. percentStep, percentFinal, percentValue, percentInverse, 0);
  472. if( rc ) break;
  473. }
  474. return rc;
  475. }