latticepare.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568
  1. /********************************************************************
  2. * *
  3. * THIS FILE IS PART OF THE Ogg Vorbis SOFTWARE CODEC SOURCE CODE. *
  4. * USE, DISTRIBUTION AND REPRODUCTION OF THIS SOURCE IS GOVERNED BY *
  5. * THE GNU PUBLIC LICENSE 2, WHICH IS INCLUDED WITH THIS SOURCE. *
  6. * PLEASE READ THESE TERMS DISTRIBUTING. *
  7. * *
  8. * THE OggSQUISH SOURCE CODE IS (C) COPYRIGHT 1994-2000 *
  9. * by Monty <monty@xiph.org> and The XIPHOPHORUS Company *
  10. * http://www.xiph.org/ *
  11. * *
  12. ********************************************************************
  13. function: utility for paring low hit count cells from lattice codebook
  14. last mod: $Id: latticepare.c,v 1.1.2.3 2000/05/08 08:25:43 xiphmont Exp $
  15. ********************************************************************/
  16. #include <stdlib.h>
  17. #include <stdio.h>
  18. #include <math.h>
  19. #include <string.h>
  20. #include <errno.h>
  21. #include "vorbis/codebook.h"
  22. #include "../lib/sharedbook.h"
  23. #include "bookutil.h"
  24. #include "vqgen.h"
  25. #include "vqsplit.h"
  26. /* Lattice codebooks have two strengths: important fetaures that are
  27. poorly modelled by global error minimization training (eg, strong
  28. peaks) are not neglected 2) compact quantized representation.
  29. A fully populated lattice codebook, however, swings point 1 too far
  30. in the opposite direction; rare features need not be modelled quite
  31. so religiously and as such, we waste bits unless we eliminate the
  32. least common cells. The codebook rep supports unused cells, so we
  33. need to tag such cells and build an auxiliary (non-thresh) search
  34. mechanism to find the proper match quickly */
  35. /* two basic steps; first is pare the cell for which dispersal creates
  36. the least additional error. This will naturally choose
  37. low-population cells and cells that have not taken on points from
  38. neighboring paring (but does not result in the lattice collapsing
  39. inward and leaving low population ares totally unmodelled). After
  40. paring has removed the desired number of cells, we need to build an
  41. auxiliary search for each culled point */
  42. /* Although lattice books (due to threshhold-based matching) do not
  43. actually use error to make cell selections (in fact, it need not
  44. bear any relation), the 'secondbest' entry finder here is in fact
  45. error/distance based, so latticepare is only useful on such books */
  46. /* command line:
  47. latticepare latticebook.vqh input_data.vqd <target_cells>
  48. produces a new output book on stdout
  49. */
  50. static double _dist(int el,double *a, double *b){
  51. int i;
  52. double acc=0.;
  53. for(i=0;i<el;i++){
  54. double val=(a[i]-b[i]);
  55. acc+=val*val;
  56. }
  57. return(acc);
  58. }
  59. static double *pointlist;
  60. static long points=0;
  61. void add_vector(codebook *b,double *vec,long n){
  62. int dim=b->dim,i,j;
  63. int step=n/dim;
  64. for(i=0;i<step;i++){
  65. for(j=i;j<n;j+=step){
  66. pointlist[points++]=vec[j];
  67. }
  68. }
  69. }
  70. /* search neighboring [non-culled] cells for lowest distance. Spiral
  71. out in the event culling is deep */
  72. static int secondbest(codebook *b,double *vec,int best){
  73. encode_aux_threshmatch *tt=b->c->thresh_tree;
  74. int dim=b->dim;
  75. int i,j,spiral=1;
  76. int *index=alloca(dim*sizeof(int)*2);
  77. int *mod=index+dim;
  78. int entry=best;
  79. double bestmetric=0;
  80. int bestentry=-1;
  81. /* decompose index */
  82. for(i=0;i<dim;i++){
  83. index[i]=best%tt->quantvals;
  84. best/=tt->quantvals;
  85. }
  86. /* hit one off on all sides of it; most likely we'll find a possible
  87. match */
  88. /* suboptimal for unaligned entries */
  89. #if 0
  90. for(i=0;i<dim;i++){
  91. /* one up */
  92. if(index[i]+1<tt->quantvals){
  93. int newentry=entry+rint(pow(tt->quantvals,i));
  94. if(b->c->lengthlist[newentry]>0){
  95. double thismetric=_dist(dim, vec, b->valuelist+newentry*dim);
  96. if(bestentry==-1 || bestmetric>thismetric){
  97. bestmetric=thismetric;
  98. bestentry=newentry;
  99. }
  100. }
  101. }
  102. /* one down */
  103. if(index[i]-1>=0){
  104. int newentry=entry-rint(pow(tt->quantvals,i));
  105. if(b->c->lengthlist[newentry]>0){
  106. double thismetric=_dist(dim, vec, b->valuelist+newentry*dim);
  107. if(bestentry==-1 || bestmetric>thismetric){
  108. bestmetric=thismetric;
  109. bestentry=newentry;
  110. }
  111. }
  112. }
  113. }
  114. #endif
  115. /* no match? search all cells, binary count, that are one away on
  116. one or more axes. Then continue out until there's a match.
  117. We'll find one eventually, it's relatively OK to be inefficient
  118. as the attempt above will almost always succeed */
  119. while(bestentry==-1){
  120. for(i=0;i<dim;i++)mod[i]= -spiral;
  121. while(1){
  122. int newentry=entry;
  123. /* update the mod */
  124. for(j=0;j<dim;j++){
  125. if(mod[j]<=spiral)
  126. break;
  127. else{
  128. if(j+1<dim){
  129. mod[j]= -spiral;
  130. mod[j+1]++;
  131. }
  132. }
  133. }
  134. if(j==dim)break;
  135. /* reconstitute the entry */
  136. for(j=0;j<dim;j++){
  137. if(index[j]+mod[j]<0 || index[j]+mod[j]>=tt->quantvals){
  138. newentry=-1;
  139. break;
  140. }
  141. newentry+=mod[j]*rint(pow(tt->quantvals,j));
  142. }
  143. if(newentry!=-1 && newentry!=entry)
  144. if(b->c->lengthlist[newentry]>0){
  145. double thismetric=_dist(dim, vec, b->valuelist+newentry*dim);
  146. if(bestentry==-1 || bestmetric>thismetric){
  147. bestmetric=thismetric;
  148. bestentry=newentry;
  149. }
  150. }
  151. mod[0]++;
  152. }
  153. spiral++;
  154. }
  155. return(bestentry);
  156. }
  157. void usage(void){
  158. fprintf(stderr,"Ogg/Vorbis lattice codebook paring utility\n\n"
  159. "usage: latticepare book.vqh data.vqd <target_cells>\n"
  160. " -<n_0,n_1,...> [-<n_0,n_1,...>]\n\n"
  161. "where <target_cells> is the desired number of final cells (or -1\n"
  162. "for no change) and n,n,n,n...n are explicit entries to cull\n\n"
  163. "produces new book on stdout\n\n");
  164. exit(1);
  165. }
  166. int main(int argc,char *argv[]){
  167. char *basename;
  168. codebook *b=NULL;
  169. int entries=0;
  170. int dim=0;
  171. long i,j,target=-1;
  172. int *cvec=NULL;
  173. long *cullist=malloc(sizeof(int));
  174. long culls=0;
  175. argv++;
  176. if(*argv==NULL){
  177. usage();
  178. exit(1);
  179. }
  180. /* yes, this is evil. However, it's very convenient to parse file
  181. extentions */
  182. while(*argv){
  183. if(*argv[0]=='-'){
  184. char *ptr=argv[0];
  185. long index=0;
  186. /* explicit cull */
  187. if(!b)usage();
  188. if(!cvec)cvec=malloc(dim*sizeof(int)); /* lazy ;-) */
  189. for(i=0;i<dim;i++){
  190. if(!ptr){
  191. fprintf(stderr,"too few values in cull argument %s\n",argv[0]);
  192. exit(1);
  193. }
  194. cvec[i]=atoi(ptr+1);
  195. if(cvec[i]<0 || cvec[i]>=b->c->thresh_tree->quantvals){
  196. fprintf(stderr,"value too large in cull argument %s\n",argv[0]);
  197. exit(1);
  198. }
  199. ptr=strchr(ptr+1,',');
  200. }
  201. if(ptr){
  202. fprintf(stderr,"too many values in cull argument %s\n",argv[0]);
  203. exit(1);
  204. }
  205. for(i=dim;i>0;i--)
  206. index=index*b->c->thresh_tree->quantvals+cvec[i-1];
  207. cullist=realloc(cullist,++culls*sizeof(long));
  208. cullist[culls-1]=index;
  209. fprintf(stderr,"\rExplicitly culling index %ld\n",index);
  210. argv++;
  211. }else{
  212. /* input file. What kind? */
  213. char *dot;
  214. char *ext=NULL;
  215. char *name=strdup(*argv++);
  216. dot=strrchr(name,'.');
  217. if(dot)
  218. ext=dot+1;
  219. else{
  220. ext="";
  221. target=atol(name);
  222. if(target==0)target=entries;
  223. }
  224. /* codebook */
  225. if(!strcmp(ext,"vqh")){
  226. basename=strrchr(name,'/');
  227. if(basename)
  228. basename=strdup(basename)+1;
  229. else
  230. basename=strdup(name);
  231. dot=strrchr(basename,'.');
  232. if(dot)*dot='\0';
  233. b=codebook_load(name);
  234. dim=b->dim;
  235. entries=b->entries;
  236. }
  237. /* data file; we do actually need to suck it into memory */
  238. /* we're dealing with just one book, so we can de-interleave */
  239. if(!strcmp(ext,"vqd") && !points){
  240. int cols;
  241. long lines=0;
  242. char *line;
  243. double *vec;
  244. FILE *in=fopen(name,"r");
  245. if(!in){
  246. fprintf(stderr,"Could not open input file %s\n",name);
  247. exit(1);
  248. }
  249. reset_next_value();
  250. line=setup_line(in);
  251. /* count cols before we start reading */
  252. {
  253. char *temp=line;
  254. while(*temp==' ')temp++;
  255. for(cols=0;*temp;cols++){
  256. while(*temp>32)temp++;
  257. while(*temp==' ')temp++;
  258. }
  259. }
  260. vec=alloca(cols*sizeof(double));
  261. /* count, then load, to avoid fragmenting the hell out of
  262. memory */
  263. while(line){
  264. lines++;
  265. for(j=0;j<cols;j++)
  266. if(get_line_value(in,vec+j)){
  267. fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
  268. exit(1);
  269. }
  270. if((lines&0xff)==0)spinnit("counting samples...",lines*cols);
  271. line=setup_line(in);
  272. }
  273. pointlist=malloc(cols*lines*sizeof(double));
  274. rewind(in);
  275. line=setup_line(in);
  276. while(line){
  277. lines--;
  278. for(j=0;j<cols;j++)
  279. if(get_line_value(in,vec+j)){
  280. fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
  281. exit(1);
  282. }
  283. /* deinterleave, add to heap */
  284. add_vector(b,vec,cols);
  285. if((lines&0xff)==0)spinnit("loading samples...",lines*cols);
  286. line=setup_line(in);
  287. }
  288. fclose(in);
  289. }
  290. }
  291. }
  292. if(!entries || !points)usage();
  293. if(target==-1)usage();
  294. points/=dim;
  295. /* set up auxiliary vectors for error tracking */
  296. {
  297. encode_aux_nearestmatch *nt=NULL;
  298. long pointssofar=0;
  299. long *pointindex;
  300. long indexedpoints=0;
  301. long *entryindex;
  302. long *reventry;
  303. long *membership=malloc(points*sizeof(long));
  304. long *cellhead=malloc(entries*sizeof(long));
  305. long *cellcount=calloc(entries,sizeof(long));
  306. double *cellerror1=calloc(entries,sizeof(double)); /* error for
  307. firstentries */
  308. double *cellerror2=calloc(entries,sizeof(double)); /* error for
  309. secondentry */
  310. double globalerror=0.;
  311. long cellsleft=entries;
  312. for(i=0;i<points;i++)membership[i]=-1;
  313. for(i=0;i<entries;i++)cellhead[i]=-1;
  314. for(i=0;i<points;i++){
  315. /* assign vectors to the nearest cell. Also keep track of second
  316. nearest for error statistics */
  317. double *ppt=pointlist+i*dim;
  318. int firstentry=_best(b,ppt,1);
  319. int secondentry=secondbest(b,ppt,firstentry);
  320. double firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
  321. double secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
  322. if(!(i&0xff))spinnit("initializing... ",points-i);
  323. membership[i]=cellhead[firstentry];
  324. cellhead[firstentry]=i;
  325. cellerror1[firstentry]+=firstmetric;
  326. cellcount[firstentry]++;
  327. globalerror+=firstmetric;
  328. cellerror2[firstentry]+=secondmetric;
  329. }
  330. /* handle the explicit cull list */
  331. for(i=0;i<culls;i++){
  332. long bestcell=cullist[i];
  333. char buf[80];
  334. sprintf(buf,"explicit culls (%d left)... ",(int)culls-i);
  335. /* disperse cell. move each point out, adding it (properly) to
  336. the second best */
  337. if(b->c->lengthlist[bestcell]>0){
  338. long head=cellhead[bestcell];
  339. b->c->lengthlist[bestcell]=0;
  340. cellhead[bestcell]=-1;
  341. while(head!=-1){
  342. /* head is a point number */
  343. double *ppt=pointlist+head*dim;
  344. int newentry=secondbest(b,ppt,bestcell);
  345. int secondentry=secondbest(b,pointlist+head*dim,newentry);
  346. double firstmetric=_dist(dim,b->valuelist+dim*newentry,ppt);
  347. double secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
  348. long next=membership[head];
  349. cellcount[newentry]++;
  350. cellcount[bestcell]--;
  351. cellerror1[newentry]+=firstmetric;
  352. cellerror2[newentry]+=secondmetric;
  353. spinnit(buf,cellcount[bestcell]);
  354. membership[head]=cellhead[newentry];
  355. cellhead[newentry]=head;
  356. head=next;
  357. }
  358. cellsleft--;
  359. }
  360. }
  361. /* do the automatic cull request */
  362. while(cellsleft>target){
  363. int bestcell=-1;
  364. double besterror=0;
  365. long head=-1;
  366. spinnit("cells left to eliminate... ",cellsleft-target);
  367. /* find the cell with lowest removal impact */
  368. for(i=0;i<entries;i++){
  369. if(b->c->lengthlist[i]>0){
  370. double thiserror=globalerror-cellerror1[i]+cellerror2[i];
  371. if(bestcell==-1 || besterror>thiserror){
  372. besterror=thiserror;
  373. bestcell=i;
  374. }
  375. }
  376. }
  377. /* disperse it. move each point out, adding it (properly) to
  378. the second best */
  379. b->c->lengthlist[bestcell]=0;
  380. head=cellhead[bestcell];
  381. cellhead[bestcell]=-1;
  382. while(head!=-1){
  383. /* head is a point number */
  384. double *ppt=pointlist+head*dim;
  385. int newentry=secondbest(b,ppt,bestcell);
  386. int secondentry=secondbest(b,pointlist+head*dim,newentry);
  387. double firstmetric=_dist(dim,b->valuelist+dim*newentry,ppt);
  388. double secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
  389. long next=membership[head];
  390. cellerror1[newentry]+=firstmetric;
  391. cellerror2[newentry]+=secondmetric;
  392. membership[head]=cellhead[newentry];
  393. cellhead[newentry]=head;
  394. head=next;
  395. }
  396. cellsleft--;
  397. }
  398. /* paring is over. Build decision trees using points that now fall
  399. through the thresh matcher. */
  400. /* we don't free membership; we flatten it in order to use in lp_split */
  401. for(i=0;i<entries;i++){
  402. long head=cellhead[i];
  403. spinnit("rearranging membership cache... ",entries-i);
  404. while(head!=-1){
  405. long next=membership[head];
  406. membership[head]=i;
  407. head=next;
  408. }
  409. }
  410. free(cellhead);
  411. free(cellerror1);
  412. free(cellerror2);
  413. pointindex=malloc(points*sizeof(long));
  414. /* make a point index of fall-through points */
  415. for(i=0;i<points;i++){
  416. int best=_best(b,pointlist+i*dim,1);
  417. if(best==-1)
  418. pointindex[indexedpoints++]=i;
  419. spinnit("finding orphaned points... ",points-i);
  420. }
  421. /* make an entry index */
  422. entryindex=malloc(entries*sizeof(long));
  423. target=0;
  424. for(i=0;i<entries;i++){
  425. if(b->c->lengthlist[i]>0)
  426. entryindex[target++]=i;
  427. }
  428. /* make working space for a reverse entry index */
  429. reventry=malloc(entries*sizeof(long));
  430. /* do the split */
  431. nt=b->c->nearest_tree=
  432. calloc(1,sizeof(encode_aux_nearestmatch));
  433. nt->alloc=4096;
  434. nt->ptr0=malloc(sizeof(long)*nt->alloc);
  435. nt->ptr1=malloc(sizeof(long)*nt->alloc);
  436. nt->p=malloc(sizeof(long)*nt->alloc);
  437. nt->q=malloc(sizeof(long)*nt->alloc);
  438. nt->aux=0;
  439. fprintf(stderr,"Leaves added: %d \n",
  440. lp_split(pointlist,points,
  441. b,entryindex,target,
  442. pointindex,indexedpoints,
  443. membership,reventry,
  444. 0,&pointssofar));
  445. free(membership);
  446. free(reventry);
  447. free(pointindex);
  448. /* recount hits. Build new lengthlist. reuse entryindex storage */
  449. for(i=0;i<entries;i++)entryindex[i]=1;
  450. for(i=0;i<points;i++){
  451. int best=_best(b,pointlist+i*dim,1);
  452. if(!(i&0xff))spinnit("counting hits...",i);
  453. if(best==-1){
  454. fprintf(stderr,"\nINTERNAL ERROR; a point count not be matched to a\n"
  455. "codebook entry. The new decision tree is broken.\n");
  456. exit(1);
  457. }
  458. entryindex[best]++;
  459. }
  460. /* the lengthlist builder doesn't actually deal with 0 hit entries.
  461. So, we pack the 'sparse' hit list into a dense list, then unpack
  462. the lengths after the build */
  463. {
  464. int upper=0;
  465. long *lengthlist=calloc(entries,sizeof(long));
  466. for(i=0;i<entries;i++)
  467. if(b->c->lengthlist[i]>0)
  468. entryindex[upper++]=entryindex[i];
  469. /* sanity check */
  470. if(upper != target){
  471. fprintf(stderr,"\nINTERNAL ERROR; packed the wrong number of entries\n");
  472. exit(1);
  473. }
  474. build_tree_from_lengths(upper,entryindex,lengthlist);
  475. upper=0;
  476. for(i=0;i<entries;i++)
  477. if(b->c->lengthlist[i]>0)
  478. b->c->lengthlist[i]=lengthlist[upper++];
  479. }
  480. }
  481. /* we're done. write it out. */
  482. write_codebook(stdout,"foo",b->c);
  483. fprintf(stderr,"\r \nDone.\n");
  484. return(0);
  485. }