123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504 |
- /*
- ** 2013-05-28
- **
- ** The author disclaims copyright to this source code. In place of
- ** a legal notice, here is a blessing:
- **
- ** May you do good and not evil.
- ** May you find forgiveness for yourself and forgive others.
- ** May you share freely, never taking more than you give.
- **
- ******************************************************************************
- **
- ** This file contains code to implement the percentile(Y,P) SQL function
- ** and similar as described below:
- **
- ** (1) The percentile(Y,P) function is an aggregate function taking
- ** exactly two arguments.
- **
- ** (2) If the P argument to percentile(Y,P) is not the same for every
- ** row in the aggregate then an error is thrown. The word "same"
- ** in the previous sentence means that the value differ by less
- ** than 0.001.
- **
- ** (3) If the P argument to percentile(Y,P) evaluates to anything other
- ** than a number in the range of 0.0 to 100.0 inclusive then an
- ** error is thrown.
- **
- ** (4) If any Y argument to percentile(Y,P) evaluates to a value that
- ** is not NULL and is not numeric then an error is thrown.
- **
- ** (5) If any Y argument to percentile(Y,P) evaluates to plus or minus
- ** infinity then an error is thrown. (SQLite always interprets NaN
- ** values as NULL.)
- **
- ** (6) Both Y and P in percentile(Y,P) can be arbitrary expressions,
- ** including CASE WHEN expressions.
- **
- ** (7) The percentile(Y,P) aggregate is able to handle inputs of at least
- ** one million (1,000,000) rows.
- **
- ** (8) If there are no non-NULL values for Y, then percentile(Y,P)
- ** returns NULL.
- **
- ** (9) If there is exactly one non-NULL value for Y, the percentile(Y,P)
- ** returns the one Y value.
- **
- ** (10) If there N non-NULL values of Y where N is two or more and
- ** the Y values are ordered from least to greatest and a graph is
- ** drawn from 0 to N-1 such that the height of the graph at J is
- ** the J-th Y value and such that straight lines are drawn between
- ** adjacent Y values, then the percentile(Y,P) function returns
- ** the height of the graph at P*(N-1)/100.
- **
- ** (11) The percentile(Y,P) function always returns either a floating
- ** point number or NULL.
- **
- ** (12) The percentile(Y,P) is implemented as a single C99 source-code
- ** file that compiles into a shared-library or DLL that can be loaded
- ** into SQLite using the sqlite3_load_extension() interface.
- **
- ** (13) A separate median(Y) function is the equivalent percentile(Y,50).
- **
- ** (14) A separate percentile_cont(Y,P) function is equivalent to
- ** percentile(Y,P/100.0). In other words, the fraction value in
- ** the second argument is in the range of 0 to 1 instead of 0 to 100.
- **
- ** (15) A separate percentile_disc(Y,P) function is like
- ** percentile_cont(Y,P) except that instead of returning the weighted
- ** average of the nearest two input values, it returns the next lower
- ** value. So the percentile_disc(Y,P) will always return a value
- ** that was one of the inputs.
- **
- ** (16) All of median(), percentile(Y,P), percentile_cont(Y,P) and
- ** percentile_disc(Y,P) can be used as window functions.
- **
- ** Differences from standard SQL:
- **
- ** * The percentile_cont(X,P) function is equivalent to the following in
- ** standard SQL:
- **
- ** (percentile_cont(P) WITHIN GROUP (ORDER BY X))
- **
- ** The SQLite syntax is much more compact. The standard SQL syntax
- ** is also supported if SQLite is compiled with the
- ** -DSQLITE_ENABLE_ORDERED_SET_AGGREGATES option.
- **
- ** * No median(X) function exists in the SQL standard. App developers
- ** are expected to write "percentile_cont(0.5)WITHIN GROUP(ORDER BY X)".
- **
- ** * No percentile(Y,P) function exists in the SQL standard. Instead of
- ** percential(Y,P), developers must write this:
- ** "percentile_cont(P/100.0) WITHIN GROUP (ORDER BY Y)". Note that
- ** the fraction parameter to percentile() goes from 0 to 100 whereas
- ** the fraction parameter in SQL standard percentile_cont() goes from
- ** 0 to 1.
- **
- ** Implementation notes as of 2024-08-31:
- **
- ** * The regular aggregate-function versions of these routines work
- ** by accumulating all values in an array of doubles, then sorting
- ** that array using quicksort before computing the answer. Thus
- ** the runtime is O(NlogN) where N is the number of rows of input.
- **
- ** * For the window-function versions of these routines, the array of
- ** inputs is sorted as soon as the first value is computed. Thereafter,
- ** the array is kept in sorted order using an insert-sort. This
- ** results in O(N*K) performance where K is the size of the window.
- ** One can imagine alternative implementations that give O(N*logN*logK)
- ** performance, but they require more complex logic and data structures.
- ** The developers have elected to keep the asymptotically slower
- ** algorithm for now, for simplicity, under the theory that window
- ** functions are seldom used and when they are, the window size K is
- ** often small. The developers might revisit that decision later,
- ** should the need arise.
- */
- #if defined(SQLITE3_H)
- /* no-op */
- #elif defined(SQLITE_STATIC_PERCENTILE)
- # include "sqlite3.h"
- #else
- # include "sqlite3ext.h"
- SQLITE_EXTENSION_INIT1
- #endif
- #include <assert.h>
- #include <string.h>
- #include <stdlib.h>
- /* The following object is the group context for a single percentile()
- ** aggregate. Remember all input Y values until the very end.
- ** Those values are accumulated in the Percentile.a[] array.
- */
- typedef struct Percentile Percentile;
- struct Percentile {
- unsigned nAlloc; /* Number of slots allocated for a[] */
- unsigned nUsed; /* Number of slots actually used in a[] */
- char bSorted; /* True if a[] is already in sorted order */
- char bKeepSorted; /* True if advantageous to keep a[] sorted */
- char bPctValid; /* True if rPct is valid */
- double rPct; /* Fraction. 0.0 to 1.0 */
- double *a; /* Array of Y values */
- };
- /* Details of each function in the percentile family */
- typedef struct PercentileFunc PercentileFunc;
- struct PercentileFunc {
- const char *zName; /* Function name */
- char nArg; /* Number of arguments */
- char mxFrac; /* Maximum value of the "fraction" input */
- char bDiscrete; /* True for percentile_disc() */
- };
- static const PercentileFunc aPercentFunc[] = {
- { "median", 1, 1, 0 },
- { "percentile", 2, 100, 0 },
- { "percentile_cont", 2, 1, 0 },
- { "percentile_disc", 2, 1, 1 },
- };
- /*
- ** Return TRUE if the input floating-point number is an infinity.
- */
- static int percentIsInfinity(double r){
- sqlite3_uint64 u;
- assert( sizeof(u)==sizeof(r) );
- memcpy(&u, &r, sizeof(u));
- return ((u>>52)&0x7ff)==0x7ff;
- }
- /*
- ** Return TRUE if two doubles differ by 0.001 or less.
- */
- static int percentSameValue(double a, double b){
- a -= b;
- return a>=-0.001 && a<=0.001;
- }
- /*
- ** Search p (which must have p->bSorted) looking for an entry with
- ** value y. Return the index of that entry.
- **
- ** If bExact is true, return -1 if the entry is not found.
- **
- ** If bExact is false, return the index at which a new entry with
- ** value y should be insert in order to keep the values in sorted
- ** order. The smallest return value in this case will be 0, and
- ** the largest return value will be p->nUsed.
- */
- static int percentBinarySearch(Percentile *p, double y, int bExact){
- int iFirst = 0; /* First element of search range */
- int iLast = p->nUsed - 1; /* Last element of search range */
- while( iLast>=iFirst ){
- int iMid = (iFirst+iLast)/2;
- double x = p->a[iMid];
- if( x<y ){
- iFirst = iMid + 1;
- }else if( x>y ){
- iLast = iMid - 1;
- }else{
- return iMid;
- }
- }
- if( bExact ) return -1;
- return iFirst;
- }
- /*
- ** Generate an error for a percentile function.
- **
- ** The error format string must have exactly one occurrance of "%%s()"
- ** (with two '%' characters). That substring will be replaced by the name
- ** of the function.
- */
- static void percentError(sqlite3_context *pCtx, const char *zFormat, ...){
- PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
- char *zMsg1;
- char *zMsg2;
- va_list ap;
- va_start(ap, zFormat);
- zMsg1 = sqlite3_vmprintf(zFormat, ap);
- va_end(ap);
- zMsg2 = zMsg1 ? sqlite3_mprintf(zMsg1, pFunc->zName) : 0;
- sqlite3_result_error(pCtx, zMsg2, -1);
- sqlite3_free(zMsg1);
- sqlite3_free(zMsg2);
- }
- /*
- ** The "step" function for percentile(Y,P) is called once for each
- ** input row.
- */
- static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
- Percentile *p;
- double rPct;
- int eType;
- double y;
- assert( argc==2 || argc==1 );
- if( argc==1 ){
- /* Requirement 13: median(Y) is the same as percentile(Y,50). */
- rPct = 0.5;
- }else{
- /* Requirement 3: P must be a number between 0 and 100 */
- PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
- eType = sqlite3_value_numeric_type(argv[1]);
- rPct = sqlite3_value_double(argv[1])/(double)pFunc->mxFrac;
- if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT)
- || rPct<0.0 || rPct>1.0
- ){
- percentError(pCtx, "the fraction argument to %%s()"
- " is not between 0.0 and %.1f",
- (double)pFunc->mxFrac);
- return;
- }
- }
- /* Allocate the session context. */
- p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
- if( p==0 ) return;
- /* Remember the P value. Throw an error if the P value is different
- ** from any prior row, per Requirement (2). */
- if( !p->bPctValid ){
- p->rPct = rPct;
- p->bPctValid = 1;
- }else if( !percentSameValue(p->rPct,rPct) ){
- percentError(pCtx, "the fraction argument to %%s()"
- " is not the same for all input rows");
- return;
- }
- /* Ignore rows for which Y is NULL */
- eType = sqlite3_value_type(argv[0]);
- if( eType==SQLITE_NULL ) return;
- /* If not NULL, then Y must be numeric. Otherwise throw an error.
- ** Requirement 4 */
- if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
- percentError(pCtx, "input to %%s() is not numeric");
- return;
- }
- /* Throw an error if the Y value is infinity or NaN */
- y = sqlite3_value_double(argv[0]);
- if( percentIsInfinity(y) ){
- percentError(pCtx, "Inf input to %%s()");
- return;
- }
- /* Allocate and store the Y */
- if( p->nUsed>=p->nAlloc ){
- unsigned n = p->nAlloc*2 + 250;
- double *a = sqlite3_realloc64(p->a, sizeof(double)*n);
- if( a==0 ){
- sqlite3_free(p->a);
- memset(p, 0, sizeof(*p));
- sqlite3_result_error_nomem(pCtx);
- return;
- }
- p->nAlloc = n;
- p->a = a;
- }
- if( p->nUsed==0 ){
- p->a[p->nUsed++] = y;
- p->bSorted = 1;
- }else if( !p->bSorted || y>=p->a[p->nUsed-1] ){
- p->a[p->nUsed++] = y;
- }else if( p->bKeepSorted ){
- int i;
- i = percentBinarySearch(p, y, 0);
- if( i<(int)p->nUsed ){
- memmove(&p->a[i+1], &p->a[i], (p->nUsed-i)*sizeof(p->a[0]));
- }
- p->a[i] = y;
- p->nUsed++;
- }else{
- p->a[p->nUsed++] = y;
- p->bSorted = 0;
- }
- }
- /*
- ** Interchange two doubles.
- */
- #define SWAP_DOUBLE(X,Y) {double ttt=(X);(X)=(Y);(Y)=ttt;}
- /*
- ** Sort an array of doubles.
- **
- ** Algorithm: quicksort
- **
- ** This is implemented separately rather than using the qsort() routine
- ** from the standard library because:
- **
- ** (1) To avoid a dependency on qsort()
- ** (2) To avoid the function call to the comparison routine for each
- ** comparison.
- */
- static void percentSort(double *a, unsigned int n){
- int iLt; /* Entries before a[iLt] are less than rPivot */
- int iGt; /* Entries at or after a[iGt] are greater than rPivot */
- int i; /* Loop counter */
- double rPivot; /* The pivot value */
-
- assert( n>=2 );
- if( a[0]>a[n-1] ){
- SWAP_DOUBLE(a[0],a[n-1])
- }
- if( n==2 ) return;
- iGt = n-1;
- i = n/2;
- if( a[0]>a[i] ){
- SWAP_DOUBLE(a[0],a[i])
- }else if( a[i]>a[iGt] ){
- SWAP_DOUBLE(a[i],a[iGt])
- }
- if( n==3 ) return;
- rPivot = a[i];
- iLt = i = 1;
- do{
- if( a[i]<rPivot ){
- if( i>iLt ) SWAP_DOUBLE(a[i],a[iLt])
- iLt++;
- i++;
- }else if( a[i]>rPivot ){
- do{
- iGt--;
- }while( iGt>i && a[iGt]>rPivot );
- SWAP_DOUBLE(a[i],a[iGt])
- }else{
- i++;
- }
- }while( i<iGt );
- if( iLt>=2 ) percentSort(a, iLt);
- if( n-iGt>=2 ) percentSort(a+iGt, n-iGt);
-
- /* Uncomment for testing */
- #if 0
- for(i=0; i<n-1; i++){
- assert( a[i]<=a[i+1] );
- }
- #endif
- }
- /*
- ** The "inverse" function for percentile(Y,P) is called to remove a
- ** row that was previously inserted by "step".
- */
- static void percentInverse(sqlite3_context *pCtx,int argc,sqlite3_value **argv){
- Percentile *p;
- int eType;
- double y;
- int i;
- assert( argc==2 || argc==1 );
- /* Allocate the session context. */
- p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
- assert( p!=0 );
- /* Ignore rows for which Y is NULL */
- eType = sqlite3_value_type(argv[0]);
- if( eType==SQLITE_NULL ) return;
- /* If not NULL, then Y must be numeric. Otherwise throw an error.
- ** Requirement 4 */
- if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
- return;
- }
- /* Ignore the Y value if it is infinity or NaN */
- y = sqlite3_value_double(argv[0]);
- if( percentIsInfinity(y) ){
- return;
- }
- if( p->bSorted==0 ){
- assert( p->nUsed>1 );
- percentSort(p->a, p->nUsed);
- p->bSorted = 1;
- }
- p->bKeepSorted = 1;
- /* Find and remove the row */
- i = percentBinarySearch(p, y, 1);
- if( i>=0 ){
- p->nUsed--;
- if( i<(int)p->nUsed ){
- memmove(&p->a[i], &p->a[i+1], (p->nUsed - i)*sizeof(p->a[0]));
- }
- }
- }
- /*
- ** Compute the final output of percentile(). Clean up all allocated
- ** memory if and only if bIsFinal is true.
- */
- static void percentCompute(sqlite3_context *pCtx, int bIsFinal){
- Percentile *p;
- PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
- unsigned i1, i2;
- double v1, v2;
- double ix, vx;
- p = (Percentile*)sqlite3_aggregate_context(pCtx, 0);
- if( p==0 ) return;
- if( p->a==0 ) return;
- if( p->nUsed ){
- if( p->bSorted==0 ){
- assert( p->nUsed>1 );
- percentSort(p->a, p->nUsed);
- p->bSorted = 1;
- }
- ix = p->rPct*(p->nUsed-1);
- i1 = (unsigned)ix;
- if( pFunc->bDiscrete ){
- vx = p->a[i1];
- }else{
- i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1;
- v1 = p->a[i1];
- v2 = p->a[i2];
- vx = v1 + (v2-v1)*(ix-i1);
- }
- sqlite3_result_double(pCtx, vx);
- }
- if( bIsFinal ){
- sqlite3_free(p->a);
- memset(p, 0, sizeof(*p));
- }else{
- p->bKeepSorted = 1;
- }
- }
- static void percentFinal(sqlite3_context *pCtx){
- percentCompute(pCtx, 1);
- }
- static void percentValue(sqlite3_context *pCtx){
- percentCompute(pCtx, 0);
- }
- #if defined(_WIN32) && !defined(SQLITE3_H) && !defined(SQLITE_STATIC_PERCENTILE)
- __declspec(dllexport)
- #endif
- int sqlite3_percentile_init(
- sqlite3 *db,
- char **pzErrMsg,
- const sqlite3_api_routines *pApi
- ){
- int rc = SQLITE_OK;
- unsigned int i;
- #ifdef SQLITE3EXT_H
- SQLITE_EXTENSION_INIT2(pApi);
- #else
- (void)pApi; /* Unused parameter */
- #endif
- (void)pzErrMsg; /* Unused parameter */
- for(i=0; i<sizeof(aPercentFunc)/sizeof(aPercentFunc[0]); i++){
- rc = sqlite3_create_window_function(db,
- aPercentFunc[i].zName,
- aPercentFunc[i].nArg,
- SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_SELFORDER1,
- (void*)&aPercentFunc[i],
- percentStep, percentFinal, percentValue, percentInverse, 0);
- if( rc ) break;
- }
- return rc;
- }
|