text.c 83 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311
  1. /* ------------------------------------------------------------------------- */
  2. /* "text" : Text translation, the abbreviations optimiser, the dictionary */
  3. /* */
  4. /* Part of Inform 6.33 */
  5. /* copyright (c) Graham Nelson 1993 - 2014 */
  6. /* */
  7. /* ------------------------------------------------------------------------- */
  8. #include "header.h"
  9. uchar *low_strings, *low_strings_top; /* Start and next free byte in the low
  10. strings pool */
  11. int32 static_strings_extent; /* Number of bytes of static strings
  12. made so far */
  13. memory_block static_strings_area; /* Used if (!temporary_files_switch) to
  14. hold the static strings area so far */
  15. static uchar *strings_holding_area; /* Area holding translated strings
  16. until they are moved into either
  17. a temporary file, or the
  18. static_strings_area below */
  19. char *all_text, *all_text_top; /* Start and next byte free in (large)
  20. text buffer holding the entire text
  21. of the game, when it is being
  22. recorded */
  23. int put_strings_in_low_memory, /* When TRUE, put static strings in
  24. the low strings pool at 0x100 rather
  25. than in the static strings area */
  26. is_abbreviation, /* When TRUE, the string being trans
  27. is itself an abbreviation string
  28. so can't make use of abbreviations */
  29. abbrevs_lookup_table_made, /* The abbreviations lookup table is
  30. constructed when the first non-
  31. abbreviation string is translated:
  32. this flag is TRUE after that */
  33. abbrevs_lookup[256]; /* Once this has been constructed,
  34. abbrevs_lookup[n] = the smallest
  35. number of any abbreviation beginning
  36. with ASCII character n, or -1
  37. if none of the abbreviations do */
  38. int no_abbreviations; /* No of abbreviations defined so far */
  39. uchar *abbreviations_at; /* Memory to hold the text of any
  40. abbreviation strings declared */
  41. /* ------------------------------------------------------------------------- */
  42. /* Glulx string compression storage */
  43. /* ------------------------------------------------------------------------- */
  44. int no_strings; /* No of strings in static strings
  45. area. */
  46. int no_dynamic_strings; /* No. of @.. string escapes used
  47. (actually, the highest value used
  48. plus one) */
  49. int no_unicode_chars; /* Number of distinct Unicode chars
  50. used. (Beyond 0xFF.) */
  51. static int MAX_CHARACTER_SET; /* Number of possible entities */
  52. huffentity_t *huff_entities; /* The list of entities (characters,
  53. abbreviations, @.. escapes, and
  54. the terminator) */
  55. static huffentity_t **hufflist; /* Copy of the list, for sorting */
  56. int no_huff_entities; /* The number of entities in the list */
  57. int huff_unicode_start; /* Position in the list where Unicode
  58. chars begin. */
  59. int huff_abbrev_start; /* Position in the list where string
  60. abbreviations begin. */
  61. int huff_dynam_start; /* Position in the list where @..
  62. entities begin. */
  63. int huff_entity_root; /* The position in the list of the root
  64. entry (when considering the table
  65. as a tree). */
  66. int done_compression; /* Has the game text been compressed? */
  67. int32 compression_table_size; /* Length of the Huffman table, in
  68. bytes */
  69. int32 compression_string_size; /* Length of the compressed string
  70. data, in bytes */
  71. int32 *compressed_offsets; /* The beginning of every string in
  72. the game, relative to the beginning
  73. of the Huffman table. (So entry 0
  74. is equal to compression_table_size)*/
  75. #define UNICODE_HASH_BUCKETS (64)
  76. unicode_usage_t *unicode_usage_entries;
  77. static unicode_usage_t *unicode_usage_hash[UNICODE_HASH_BUCKETS];
  78. static int unicode_entity_index(int32 unicode);
  79. /* ------------------------------------------------------------------------- */
  80. /* Abbreviation arrays */
  81. /* ------------------------------------------------------------------------- */
  82. int *abbrev_values;
  83. int *abbrev_quality;
  84. int *abbrev_freqs;
  85. /* ------------------------------------------------------------------------- */
  86. int32 total_chars_trans, /* Number of ASCII chars of text in */
  87. total_bytes_trans, /* Number of bytes of Z-code text out */
  88. zchars_trans_in_last_string; /* Number of Z-chars in last string:
  89. needed only for abbrev efficiency
  90. calculation in "directs.c" */
  91. static int32 total_zchars_trans, /* Number of Z-chars of text out
  92. (only used to calculate the above) */
  93. no_chars_transcribed; /* Number of ASCII chars written to
  94. the text transcription area (used
  95. for the -r and -u switches) */
  96. static int zchars_out_buffer[3], /* During text translation, a buffer of
  97. 3 Z-chars at a time: when it's full
  98. these are written as a 2-byte word */
  99. zob_index; /* Index (0 to 2) into it */
  100. static unsigned char *text_out_pc; /* The "program counter" during text
  101. translation: the next address to
  102. write Z-coded text output to */
  103. static unsigned char *text_out_limit; /* The upper limit of text_out_pc
  104. during text translation */
  105. static int text_out_overflow; /* During text translation, becomes
  106. true if text_out_pc tries to pass
  107. text_out_limit */
  108. /* ------------------------------------------------------------------------- */
  109. /* For variables/arrays used by the dictionary manager, see below */
  110. /* ------------------------------------------------------------------------- */
  111. /* ------------------------------------------------------------------------- */
  112. /* Prepare the abbreviations lookup table (used to speed up abbreviation */
  113. /* detection in text translation). We first bubble-sort the abbrevs into */
  114. /* alphabetical order (this is necessary for the detection algorithm to */
  115. /* to work). Since the table is only prepared once, and for a table */
  116. /* of size at most 96, there's no point using an efficient sort algorithm. */
  117. /* ------------------------------------------------------------------------- */
  118. static void make_abbrevs_lookup(void)
  119. { int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2;
  120. do
  121. { bubble_sort = FALSE;
  122. for (j=0; j<no_abbreviations; j++)
  123. for (k=j+1; k<no_abbreviations; k++)
  124. { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
  125. p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH;
  126. if (strcmp(p1,p2)<0)
  127. { strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p);
  128. l=abbrev_values[j]; abbrev_values[j]=abbrev_values[k];
  129. abbrev_values[k]=l;
  130. l=abbrev_quality[j]; abbrev_quality[j]=abbrev_quality[k];
  131. abbrev_quality[k]=l;
  132. bubble_sort = TRUE;
  133. }
  134. }
  135. } while (bubble_sort);
  136. for (j=no_abbreviations-1; j>=0; j--)
  137. { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
  138. abbrevs_lookup[(uchar)p1[0]]=j;
  139. abbrev_freqs[j]=0;
  140. }
  141. abbrevs_lookup_table_made = TRUE;
  142. }
  143. /* ------------------------------------------------------------------------- */
  144. /* Search the abbreviations lookup table (a routine which must be fast). */
  145. /* The source text to compare is text[i], text[i+1], ... and this routine */
  146. /* is only called if text[i] is indeed the first character of at least one */
  147. /* abbreviation, "from" begin the least index into the abbreviations table */
  148. /* of an abbreviation for which text[i] is the first character. Recall */
  149. /* that the abbrevs table is in alphabetical order. */
  150. /* */
  151. /* The return value is -1 if there is no match. If there is a match, the */
  152. /* text to be abbreviated out is over-written by a string of null chars */
  153. /* with "ASCII" value 1, and the abbreviation number is returned. */
  154. /* */
  155. /* In Glulx, we *do not* do this overwriting with 1's. */
  156. /* ------------------------------------------------------------------------- */
  157. static int try_abbreviations_from(unsigned char *text, int i, int from)
  158. { int j, k; uchar *p, c;
  159. c=text[i];
  160. for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
  161. (j<no_abbreviations)&&(c==p[0]); j++, p+=MAX_ABBREV_LENGTH)
  162. { if (text[i+1]==p[1])
  163. { for (k=2; p[k]!=0; k++)
  164. if (text[i+k]!=p[k]) goto NotMatched;
  165. if (!glulx_mode) {
  166. for (k=0; p[k]!=0; k++) text[i+k]=1;
  167. }
  168. abbrev_freqs[j]++;
  169. return(j);
  170. NotMatched: ;
  171. }
  172. }
  173. return(-1);
  174. }
  175. extern void make_abbreviation(char *text)
  176. {
  177. strcpy((char *)abbreviations_at
  178. + no_abbreviations*MAX_ABBREV_LENGTH, text);
  179. is_abbreviation = TRUE;
  180. abbrev_values[no_abbreviations] = compile_string(text, TRUE, TRUE);
  181. is_abbreviation = FALSE;
  182. /* The quality is the number of Z-chars saved by using this */
  183. /* abbreviation: note that it takes 2 Z-chars to print it. */
  184. abbrev_quality[no_abbreviations++] = zchars_trans_in_last_string - 2;
  185. }
  186. /* ------------------------------------------------------------------------- */
  187. /* The front end routine for text translation */
  188. /* ------------------------------------------------------------------------- */
  189. extern int32 compile_string(char *b, int in_low_memory, int is_abbrev)
  190. { int i, j; uchar *c;
  191. is_abbreviation = is_abbrev;
  192. /* Put into the low memory pool (at 0x100 in the Z-machine) of strings */
  193. /* which may be wanted as possible entries in the abbreviations table */
  194. if (!glulx_mode && in_low_memory)
  195. { j=subtract_pointers(low_strings_top,low_strings);
  196. low_strings_top=translate_text(low_strings_top, low_strings+MAX_LOW_STRINGS, b);
  197. if (!low_strings_top)
  198. memoryerror("MAX_LOW_STRINGS", MAX_LOW_STRINGS);
  199. is_abbreviation = FALSE;
  200. return(0x21+(j/2));
  201. }
  202. if (glulx_mode && done_compression)
  203. compiler_error("Tried to add a string after compression was done.");
  204. c = translate_text(strings_holding_area, strings_holding_area+MAX_STATIC_STRINGS, b);
  205. if (!c)
  206. memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
  207. i = subtract_pointers(c, strings_holding_area);
  208. /* Insert null bytes as needed to ensure that the next static string */
  209. /* also occurs at an address expressible as a packed address */
  210. if (!glulx_mode) {
  211. int textalign;
  212. if (oddeven_packing_switch)
  213. textalign = scale_factor*2;
  214. else
  215. textalign = scale_factor;
  216. while ((i%textalign)!=0)
  217. {
  218. if (i+2 > MAX_STATIC_STRINGS)
  219. memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
  220. i+=2; *c++ = 0; *c++ = 0;
  221. }
  222. }
  223. j = static_strings_extent;
  224. if (temporary_files_switch)
  225. for (c=strings_holding_area; c<strings_holding_area+i;
  226. c++, static_strings_extent++)
  227. fputc(*c,Temp1_fp);
  228. else
  229. for (c=strings_holding_area; c<strings_holding_area+i;
  230. c++, static_strings_extent++)
  231. write_byte_to_memory_block(&static_strings_area,
  232. static_strings_extent, *c);
  233. is_abbreviation = FALSE;
  234. if (!glulx_mode) {
  235. return(j/scale_factor);
  236. }
  237. else {
  238. /* The marker value is a one-based string number. (We reserve zero
  239. to mean "not a string at all". */
  240. return (++no_strings);
  241. }
  242. }
  243. /* ------------------------------------------------------------------------- */
  244. /* Output a single Z-character into the buffer, and flush it if full */
  245. /* ------------------------------------------------------------------------- */
  246. static void write_z_char_z(int i)
  247. { uint32 j;
  248. ASSERT_ZCODE();
  249. total_zchars_trans++;
  250. zchars_out_buffer[zob_index++]=(i%32);
  251. if (zob_index!=3) return;
  252. zob_index=0;
  253. j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
  254. + zchars_out_buffer[2];
  255. if (text_out_pc+2 > text_out_limit) {
  256. text_out_overflow = TRUE;
  257. return;
  258. }
  259. text_out_pc[0] = j/256; text_out_pc[1] = j%256; text_out_pc+=2;
  260. total_bytes_trans+=2;
  261. }
  262. static void write_zscii(int zsc)
  263. {
  264. int lookup_value, in_alphabet;
  265. if (zsc==' ')
  266. { write_z_char_z(0);
  267. return;
  268. }
  269. if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
  270. else lookup_value = -1;
  271. if (lookup_value >= 0)
  272. { alphabet_used[lookup_value] = 'Y';
  273. in_alphabet = lookup_value/26;
  274. if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
  275. if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
  276. write_z_char_z(lookup_value%26 + 6);
  277. }
  278. else
  279. { write_z_char_z(5); write_z_char_z(6);
  280. write_z_char_z(zsc/32); write_z_char_z(zsc%32);
  281. }
  282. }
  283. /* ------------------------------------------------------------------------- */
  284. /* Finish a Z-coded string, padding out with Z-char 5s if necessary and */
  285. /* setting the "end" bit on the final 2-byte word */
  286. /* ------------------------------------------------------------------------- */
  287. static void end_z_chars(void)
  288. { unsigned char *p;
  289. zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
  290. while (zob_index!=0) write_z_char_z(5);
  291. p=(unsigned char *) text_out_pc;
  292. *(p-2)= *(p-2)+128;
  293. }
  294. /* Glulx handles this much more simply -- compression is done elsewhere. */
  295. static void write_z_char_g(int i)
  296. {
  297. ASSERT_GLULX();
  298. if (text_out_pc+1 > text_out_limit) {
  299. text_out_overflow = TRUE;
  300. return;
  301. }
  302. total_zchars_trans++;
  303. text_out_pc[0] = i;
  304. text_out_pc++;
  305. total_bytes_trans++;
  306. }
  307. /* ------------------------------------------------------------------------- */
  308. /* The main routine "text.c" provides to the rest of Inform: the text */
  309. /* translator. p is the address to write output to, s_text the source text */
  310. /* and the return value is the next free address to write output to. */
  311. /* The return value will not exceed p_limit. If the translation tries to */
  312. /* overflow this boundary, the return value will be NULL (and you should */
  313. /* display an error). */
  314. /* Note that the source text may be corrupted by this routine. */
  315. /* ------------------------------------------------------------------------- */
  316. extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text)
  317. { int i, j, k, in_alphabet, lookup_value;
  318. int32 unicode; int zscii;
  319. unsigned char *text_in;
  320. /* Cast the input and output streams to unsigned char: text_out_pc will
  321. advance as bytes of Z-coded text are written, but text_in doesn't */
  322. text_in = (unsigned char *) s_text;
  323. text_out_pc = (unsigned char *) p;
  324. text_out_limit = (unsigned char *) p_limit;
  325. text_out_overflow = FALSE;
  326. /* Remember the Z-chars total so that later we can subtract to find the
  327. number of Z-chars translated on this string */
  328. zchars_trans_in_last_string = total_zchars_trans;
  329. /* Start with the Z-characters output buffer empty */
  330. zob_index=0;
  331. /* If this is the first text translated since the abbreviations were
  332. declared, and if some were declared, then it's time to make the
  333. lookup table for abbreviations
  334. (Except: we don't if the text being translated is itself
  335. the text of an abbreviation currently being defined) */
  336. if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
  337. && (!is_abbreviation))
  338. make_abbrevs_lookup();
  339. /* If we're storing the whole game text to memory, then add this text */
  340. if ((!is_abbreviation) && (store_the_text))
  341. { no_chars_transcribed += strlen(s_text)+2;
  342. if (no_chars_transcribed >= MAX_TRANSCRIPT_SIZE)
  343. memoryerror("MAX_TRANSCRIPT_SIZE", MAX_TRANSCRIPT_SIZE);
  344. sprintf(all_text_top, "%s\n\n", s_text);
  345. all_text_top += strlen(all_text_top);
  346. }
  347. if (transcript_switch && (!veneer_mode))
  348. write_to_transcript_file(s_text);
  349. if (!glulx_mode) {
  350. /* The empty string of Z-text is illegal, since it can't carry an end
  351. bit: so we translate an empty string of ASCII text to just the
  352. pad character 5. Printing this causes nothing to appear on screen. */
  353. if (text_in[0]==0) write_z_char_z(5);
  354. /* Loop through the characters of the null-terminated input text: note
  355. that if 1 is written over a character in the input text, it is
  356. afterwards ignored */
  357. for (i=0; text_in[i]!=0; i++)
  358. { total_chars_trans++;
  359. /* Contract ". " into ". " if double-space-removing switch set:
  360. likewise "? " and "! " if the setting is high enough */
  361. if ((double_space_setting >= 1)
  362. && (text_in[i+1]==' ') && (text_in[i+2]==' '))
  363. { if (text_in[i]=='.') text_in[i+2]=1;
  364. if (double_space_setting >= 2)
  365. { if (text_in[i]=='?') text_in[i+2]=1;
  366. if (text_in[i]=='!') text_in[i+2]=1;
  367. }
  368. }
  369. /* Try abbreviations if the economy switch set */
  370. if ((economy_switch) && (!is_abbreviation)
  371. && ((k=abbrevs_lookup[text_in[i]])!=-1))
  372. { if ((j=try_abbreviations_from(text_in, i, k))!=-1)
  373. { if (j<32) { write_z_char_z(2); write_z_char_z(j); }
  374. else { write_z_char_z(3); write_z_char_z(j-32); }
  375. }
  376. }
  377. /* If Unicode switch set, use text_to_unicode to perform UTF-8
  378. decoding */
  379. if (character_set_unicode && (text_in[i] & 0x80))
  380. { unicode = text_to_unicode((char *) (text_in+i));
  381. zscii = unicode_to_zscii(unicode);
  382. if (zscii != 5) write_zscii(zscii);
  383. else
  384. { unicode_char_error(
  385. "Character can only be used if declared in \
  386. advance as part of 'Zcharacter table':", unicode);
  387. }
  388. i += textual_form_length - 1;
  389. continue;
  390. }
  391. /* '@' is the escape character in Inform string notation: the various
  392. possibilities are:
  393. (printing only)
  394. @@decimalnumber : write this ZSCII char (0 to 1023)
  395. @twodigits : write the abbreviation string with this
  396. decimal number
  397. (any string context)
  398. @accentcode : this accented character: e.g.,
  399. for @'e write an E-acute
  400. @{...} : this Unicode char (in hex) */
  401. if (text_in[i]=='@')
  402. { if (text_in[i+1]=='@')
  403. {
  404. /* @@... */
  405. i+=2; j=atoi((char *) (text_in+i));
  406. switch(j)
  407. { /* Prevent ~ and ^ from being translated to double-quote
  408. and new-line, as they ordinarily would be */
  409. case 94: write_z_char_z(5); write_z_char_z(6);
  410. write_z_char_z(94/32); write_z_char_z(94%32);
  411. break;
  412. case 126: write_z_char_z(5); write_z_char_z(6);
  413. write_z_char_z(126/32); write_z_char_z(126%32);
  414. break;
  415. default: write_zscii(j); break;
  416. }
  417. while (isdigit(text_in[i])) i++; i--;
  418. }
  419. else if (isdigit(text_in[i+1])!=0)
  420. { int d1, d2;
  421. /* @.. */
  422. d1 = character_digit_value[text_in[i+1]];
  423. d2 = character_digit_value[text_in[i+2]];
  424. if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
  425. error("'@..' must have two decimal digits");
  426. else
  427. { i+=2;
  428. write_z_char_z(1); write_z_char_z(d1*10 + d2);
  429. }
  430. }
  431. else
  432. {
  433. /* A string escape specifying an unusual character */
  434. unicode = text_to_unicode((char *) (text_in+i));
  435. zscii = unicode_to_zscii(unicode);
  436. if (zscii != 5) write_zscii(zscii);
  437. else
  438. { unicode_char_error(
  439. "Character can only be used if declared in \
  440. advance as part of 'Zcharacter table':", unicode);
  441. }
  442. i += textual_form_length - 1;
  443. }
  444. }
  445. else
  446. { /* Skip a character which has been over-written with the null
  447. value 1 earlier on */
  448. if (text_in[i]!=1)
  449. { if (text_in[i]==' ') write_z_char_z(0);
  450. else
  451. { j = (int) text_in[i];
  452. lookup_value = iso_to_alphabet_grid[j];
  453. if (lookup_value < 0)
  454. { /* The character isn't in the standard alphabets, so
  455. we have to use the ZSCII 4-Z-char sequence */
  456. if (lookup_value == -5)
  457. { /* Character isn't in the ZSCII set at all */
  458. unicode = iso_to_unicode(j);
  459. unicode_char_error(
  460. "Character can only be used if declared in \
  461. advance as part of 'Zcharacter table':", unicode);
  462. write_zscii(0x200 + unicode/0x100);
  463. write_zscii(0x300 + unicode%0x100);
  464. }
  465. else write_zscii(-lookup_value);
  466. }
  467. else
  468. { /* The character is in one of the standard alphabets:
  469. write a SHIFT to temporarily change alphabet if
  470. it isn't in alphabet 0, then write the Z-char */
  471. alphabet_used[lookup_value] = 'Y';
  472. in_alphabet = lookup_value/26;
  473. if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
  474. if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
  475. write_z_char_z(lookup_value%26 + 6);
  476. }
  477. }
  478. }
  479. }
  480. }
  481. /* Flush the Z-characters output buffer and set the "end" bit */
  482. end_z_chars();
  483. }
  484. else {
  485. /* The text storage here is, of course, temporary. Compression
  486. will occur when we're finished compiling, so that all the
  487. clever Huffman stuff will work.
  488. In the stored text, we use "@@" to indicate @,
  489. "@0" to indicate a zero byte,
  490. "@ANNNN" to indicate an abbreviation,
  491. "@DNNNN" to indicate a dynamic string thing.
  492. "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
  493. (NNNN is a four-digit hex number using the letters A-P... an
  494. ugly representation but a convenient one.)
  495. */
  496. for (i=0; text_in[i]!=0; i++) {
  497. /* Contract ". " into ". " if double-space-removing switch set:
  498. likewise "? " and "! " if the setting is high enough. */
  499. if ((double_space_setting >= 1)
  500. && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
  501. if (text_in[i]=='.'
  502. || (double_space_setting >= 2
  503. && (text_in[i]=='?' || text_in[i]=='!'))) {
  504. text_in[i+1] = text_in[i];
  505. i++;
  506. }
  507. }
  508. total_chars_trans++;
  509. /* Try abbreviations if the economy switch set. We have to be in
  510. compression mode too, since the abbreviation mechanism is part
  511. of string decompression. */
  512. if ((economy_switch) && (compression_switch) && (!is_abbreviation)
  513. && ((k=abbrevs_lookup[text_in[i]])!=-1)
  514. && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
  515. char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
  516. i += (strlen(cx)-1);
  517. write_z_char_g('@');
  518. write_z_char_g('A');
  519. write_z_char_g('A' + ((j >>12) & 0x0F));
  520. write_z_char_g('A' + ((j >> 8) & 0x0F));
  521. write_z_char_g('A' + ((j >> 4) & 0x0F));
  522. write_z_char_g('A' + ((j ) & 0x0F));
  523. }
  524. else if (text_in[i] == '@') {
  525. if (text_in[i+1]=='@') {
  526. /* An ASCII code */
  527. i+=2; j=atoi((char *) (text_in+i));
  528. if (j == '@' || j == '\0') {
  529. write_z_char_g('@');
  530. if (j == 0) {
  531. j = '0';
  532. if (!compression_switch)
  533. warning("Ascii @@0 will prematurely terminate non-compressed \
  534. string.");
  535. }
  536. }
  537. write_z_char_g(j);
  538. while (isdigit(text_in[i])) i++; i--;
  539. }
  540. else if (isdigit(text_in[i+1])) {
  541. int d1, d2;
  542. d1 = character_digit_value[text_in[i+1]];
  543. d2 = character_digit_value[text_in[i+2]];
  544. if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
  545. error("'@..' must have two decimal digits");
  546. }
  547. else {
  548. if (!compression_switch)
  549. warning("'@..' print variable will not work in non-compressed \
  550. string; substituting ' '.");
  551. i += 2;
  552. j = d1*10 + d2;
  553. if (j >= MAX_DYNAMIC_STRINGS) {
  554. memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS);
  555. j = 0;
  556. }
  557. if (j+1 >= no_dynamic_strings)
  558. no_dynamic_strings = j+1;
  559. write_z_char_g('@');
  560. write_z_char_g('D');
  561. write_z_char_g('A' + ((j >>12) & 0x0F));
  562. write_z_char_g('A' + ((j >> 8) & 0x0F));
  563. write_z_char_g('A' + ((j >> 4) & 0x0F));
  564. write_z_char_g('A' + ((j ) & 0x0F));
  565. }
  566. }
  567. else {
  568. unicode = text_to_unicode((char *) (text_in+i));
  569. i += textual_form_length - 1;
  570. if (unicode >= 0 && unicode < 256) {
  571. write_z_char_g(unicode);
  572. }
  573. else {
  574. if (!compression_switch) {
  575. warning("Unicode characters will not work in non-compressed \
  576. string; substituting '?'.");
  577. write_z_char_g('?');
  578. }
  579. else {
  580. j = unicode_entity_index(unicode);
  581. write_z_char_g('@');
  582. write_z_char_g('U');
  583. write_z_char_g('A' + ((j >>12) & 0x0F));
  584. write_z_char_g('A' + ((j >> 8) & 0x0F));
  585. write_z_char_g('A' + ((j >> 4) & 0x0F));
  586. write_z_char_g('A' + ((j ) & 0x0F));
  587. }
  588. }
  589. }
  590. }
  591. else if (text_in[i] == '^')
  592. write_z_char_g(0x0A);
  593. else if (text_in[i] == '~')
  594. write_z_char_g('"');
  595. else if (character_set_unicode) {
  596. if (text_in[i] & 0x80) {
  597. unicode = text_to_unicode((char *) (text_in+i));
  598. i += textual_form_length - 1;
  599. if (unicode >= 0 && unicode < 256) {
  600. write_z_char_g(unicode);
  601. }
  602. else {
  603. if (!compression_switch) {
  604. warning("Unicode characters will not work in non-compressed \
  605. string; substituting '?'.");
  606. write_z_char_g('?');
  607. }
  608. else {
  609. j = unicode_entity_index(unicode);
  610. write_z_char_g('@');
  611. write_z_char_g('U');
  612. write_z_char_g('A' + ((j >>12) & 0x0F));
  613. write_z_char_g('A' + ((j >> 8) & 0x0F));
  614. write_z_char_g('A' + ((j >> 4) & 0x0F));
  615. write_z_char_g('A' + ((j ) & 0x0F));
  616. }
  617. }
  618. }
  619. else {
  620. write_z_char_g(text_in[i]);
  621. }
  622. }
  623. else {
  624. unicode = iso_to_unicode_grid[text_in[i]];
  625. if (unicode >= 0 && unicode < 256) {
  626. write_z_char_g(unicode);
  627. }
  628. else {
  629. if (!compression_switch) {
  630. warning("Unicode characters will not work in non-compressed \
  631. string; substituting '?'.");
  632. write_z_char_g('?');
  633. }
  634. else {
  635. j = unicode_entity_index(unicode);
  636. write_z_char_g('@');
  637. write_z_char_g('U');
  638. write_z_char_g('A' + ((j >>12) & 0x0F));
  639. write_z_char_g('A' + ((j >> 8) & 0x0F));
  640. write_z_char_g('A' + ((j >> 4) & 0x0F));
  641. write_z_char_g('A' + ((j ) & 0x0F));
  642. }
  643. }
  644. }
  645. }
  646. write_z_char_g(0);
  647. }
  648. if (text_out_overflow)
  649. return NULL;
  650. else
  651. return((uchar *) text_out_pc);
  652. }
  653. static int unicode_entity_index(int32 unicode)
  654. {
  655. unicode_usage_t *uptr;
  656. int j;
  657. int buck = unicode % UNICODE_HASH_BUCKETS;
  658. for (uptr = unicode_usage_hash[buck]; uptr; uptr=uptr->next) {
  659. if (uptr->ch == unicode)
  660. break;
  661. }
  662. if (uptr) {
  663. j = (uptr - unicode_usage_entries);
  664. }
  665. else {
  666. if (no_unicode_chars >= MAX_UNICODE_CHARS) {
  667. memoryerror("MAX_UNICODE_CHARS", MAX_UNICODE_CHARS);
  668. j = 0;
  669. }
  670. else {
  671. j = no_unicode_chars;
  672. no_unicode_chars++;
  673. uptr = unicode_usage_entries + j;
  674. uptr->ch = unicode;
  675. uptr->next = unicode_usage_hash[buck];
  676. unicode_usage_hash[buck] = uptr;
  677. }
  678. }
  679. return j;
  680. }
  681. /* ------------------------------------------------------------------------- */
  682. /* Glulx compression code */
  683. /* ------------------------------------------------------------------------- */
  684. static void compress_makebits(int entnum, int depth, int prevbit,
  685. huffbitlist_t *bits);
  686. /* The compressor. This uses the usual Huffman compression algorithm. */
  687. void compress_game_text()
  688. {
  689. int entities, branchstart, branches;
  690. int numlive;
  691. int32 lx;
  692. int jx;
  693. int ch;
  694. int32 ix;
  695. huffbitlist_t bits;
  696. if (compression_switch) {
  697. /* How many entities have we currently got? Well, 256 plus the
  698. string-terminator plus Unicode chars plus abbrevations plus
  699. dynamic strings. */
  700. entities = 256+1;
  701. huff_unicode_start = entities;
  702. entities += no_unicode_chars;
  703. huff_abbrev_start = entities;
  704. if (economy_switch)
  705. entities += no_abbreviations;
  706. huff_dynam_start = entities;
  707. entities += no_dynamic_strings;
  708. if (entities > MAX_CHARACTER_SET)
  709. memoryerror("MAX_CHARACTER_SET",MAX_CHARACTER_SET);
  710. /* Characters */
  711. for (jx=0; jx<256; jx++) {
  712. huff_entities[jx].type = 2;
  713. huff_entities[jx].count = 0;
  714. huff_entities[jx].u.ch = jx;
  715. }
  716. /* Terminator */
  717. huff_entities[256].type = 1;
  718. huff_entities[256].count = 0;
  719. for (jx=0; jx<no_unicode_chars; jx++) {
  720. huff_entities[huff_unicode_start+jx].type = 4;
  721. huff_entities[huff_unicode_start+jx].count = 0;
  722. huff_entities[huff_unicode_start+jx].u.val = jx;
  723. }
  724. if (economy_switch) {
  725. for (jx=0; jx<no_abbreviations; jx++) {
  726. huff_entities[huff_abbrev_start+jx].type = 3;
  727. huff_entities[huff_abbrev_start+jx].count = 0;
  728. huff_entities[huff_abbrev_start+jx].u.val = jx;
  729. }
  730. }
  731. for (jx=0; jx<no_dynamic_strings; jx++) {
  732. huff_entities[huff_dynam_start+jx].type = 9;
  733. huff_entities[huff_dynam_start+jx].count = 0;
  734. huff_entities[huff_dynam_start+jx].u.val = jx;
  735. }
  736. }
  737. else {
  738. /* No compression; use defaults that will make it easy to check
  739. for errors. */
  740. no_huff_entities = 257;
  741. huff_unicode_start = 257;
  742. huff_abbrev_start = 257;
  743. huff_dynam_start = 257+MAX_ABBREVS;
  744. compression_table_size = 0;
  745. }
  746. if (temporary_files_switch) {
  747. fclose(Temp1_fp);
  748. Temp1_fp=fopen(Temp1_Name,"rb");
  749. if (Temp1_fp==NULL)
  750. fatalerror("I/O failure: couldn't reopen temporary file 1");
  751. }
  752. if (compression_switch) {
  753. for (lx=0, ix=0; lx<no_strings; lx++) {
  754. int escapelen=0, escapetype=0;
  755. int done=FALSE;
  756. int32 escapeval;
  757. while (!done) {
  758. if (temporary_files_switch)
  759. ch = fgetc(Temp1_fp);
  760. else
  761. ch = read_byte_from_memory_block(&static_strings_area, ix);
  762. ix++;
  763. if (ix > static_strings_extent || ch < 0)
  764. compiler_error("Read too much not-yet-compressed text.");
  765. if (escapelen == -1) {
  766. escapelen = 0;
  767. if (ch == '@') {
  768. ch = '@';
  769. }
  770. else if (ch == '0') {
  771. ch = '\0';
  772. }
  773. else if (ch == 'A' || ch == 'D' || ch == 'U') {
  774. escapelen = 4;
  775. escapetype = ch;
  776. escapeval = 0;
  777. continue;
  778. }
  779. else {
  780. compiler_error("Strange @ escape in processed text.");
  781. }
  782. }
  783. else if (escapelen) {
  784. escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
  785. escapelen--;
  786. if (escapelen == 0) {
  787. if (escapetype == 'A') {
  788. ch = huff_abbrev_start+escapeval;
  789. }
  790. else if (escapetype == 'D') {
  791. ch = huff_dynam_start+escapeval;
  792. }
  793. else if (escapetype == 'U') {
  794. ch = huff_unicode_start+escapeval;
  795. }
  796. else {
  797. compiler_error("Strange @ escape in processed text.");
  798. }
  799. }
  800. else
  801. continue;
  802. }
  803. else {
  804. if (ch == '@') {
  805. escapelen = -1;
  806. continue;
  807. }
  808. if (ch == 0) {
  809. ch = 256;
  810. done = TRUE;
  811. }
  812. }
  813. huff_entities[ch].count++;
  814. }
  815. }
  816. numlive = 0;
  817. for (jx=0; jx<entities; jx++) {
  818. if (huff_entities[jx].count) {
  819. hufflist[numlive] = &(huff_entities[jx]);
  820. numlive++;
  821. }
  822. }
  823. branchstart = entities;
  824. branches = 0;
  825. while (numlive > 1) {
  826. int best1, best2;
  827. int best1num, best2num;
  828. huffentity_t *bran;
  829. if (hufflist[0]->count < hufflist[1]->count) {
  830. best1 = 0;
  831. best2 = 1;
  832. }
  833. else {
  834. best2 = 0;
  835. best1 = 1;
  836. }
  837. best1num = hufflist[best1]->count;
  838. best2num = hufflist[best2]->count;
  839. for (jx=2; jx<numlive; jx++) {
  840. if (hufflist[jx]->count < best1num) {
  841. best2 = best1;
  842. best2num = best1num;
  843. best1 = jx;
  844. best1num = hufflist[best1]->count;
  845. }
  846. else if (hufflist[jx]->count < best2num) {
  847. best2 = jx;
  848. best2num = hufflist[best2]->count;
  849. }
  850. }
  851. bran = &(huff_entities[branchstart+branches]);
  852. branches++;
  853. bran->type = 0;
  854. bran->count = hufflist[best1]->count + hufflist[best2]->count;
  855. bran->u.branch[0] = (hufflist[best1] - huff_entities);
  856. bran->u.branch[1] = (hufflist[best2] - huff_entities);
  857. hufflist[best1] = bran;
  858. if (best2 < numlive-1) {
  859. memmove(&(hufflist[best2]), &(hufflist[best2+1]),
  860. ((numlive-1) - best2) * sizeof(huffentity_t *));
  861. }
  862. numlive--;
  863. }
  864. huff_entity_root = (hufflist[0] - huff_entities);
  865. for (ix=0; ix<MAXHUFFBYTES; ix++)
  866. bits.b[ix] = 0;
  867. compression_table_size = 12;
  868. no_huff_entities = 0; /* compress_makebits will total this up */
  869. compress_makebits(huff_entity_root, 0, -1, &bits);
  870. }
  871. /* Now, sadly, we have to compute the size of the string section,
  872. without actually doing the compression. */
  873. compression_string_size = 0;
  874. if (temporary_files_switch) {
  875. fseek(Temp1_fp, 0, SEEK_SET);
  876. }
  877. if (no_strings >= MAX_NUM_STATIC_STRINGS)
  878. memoryerror("MAX_NUM_STATIC_STRINGS", MAX_NUM_STATIC_STRINGS);
  879. for (lx=0, ix=0; lx<no_strings; lx++) {
  880. int escapelen=0, escapetype=0;
  881. int done=FALSE;
  882. int32 escapeval;
  883. jx = 0;
  884. compressed_offsets[lx] = compression_table_size + compression_string_size;
  885. compression_string_size++; /* for the type byte */
  886. while (!done) {
  887. if (temporary_files_switch)
  888. ch = fgetc(Temp1_fp);
  889. else
  890. ch = read_byte_from_memory_block(&static_strings_area, ix);
  891. ix++;
  892. if (ix > static_strings_extent || ch < 0)
  893. compiler_error("Read too much not-yet-compressed text.");
  894. if (escapelen == -1) {
  895. escapelen = 0;
  896. if (ch == '@') {
  897. ch = '@';
  898. }
  899. else if (ch == '0') {
  900. ch = '\0';
  901. }
  902. else if (ch == 'A' || ch == 'D' || ch == 'U') {
  903. escapelen = 4;
  904. escapetype = ch;
  905. escapeval = 0;
  906. continue;
  907. }
  908. else {
  909. compiler_error("Strange @ escape in processed text.");
  910. }
  911. }
  912. else if (escapelen) {
  913. escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
  914. escapelen--;
  915. if (escapelen == 0) {
  916. if (escapetype == 'A') {
  917. ch = huff_abbrev_start+escapeval;
  918. }
  919. else if (escapetype == 'D') {
  920. ch = huff_dynam_start+escapeval;
  921. }
  922. else if (escapetype == 'U') {
  923. ch = huff_unicode_start+escapeval;
  924. }
  925. else {
  926. compiler_error("Strange @ escape in processed text.");
  927. }
  928. }
  929. else
  930. continue;
  931. }
  932. else {
  933. if (ch == '@') {
  934. escapelen = -1;
  935. continue;
  936. }
  937. if (ch == 0) {
  938. ch = 256;
  939. done = TRUE;
  940. }
  941. }
  942. if (compression_switch) {
  943. jx += huff_entities[ch].depth;
  944. compression_string_size += (jx/8);
  945. jx = (jx % 8);
  946. }
  947. else {
  948. if (ch >= huff_dynam_start) {
  949. compression_string_size += 3;
  950. }
  951. else if (ch >= huff_unicode_start) {
  952. compiler_error("Abbreviation/Unicode in non-compressed string \
  953. should be impossible.");
  954. }
  955. else
  956. compression_string_size += 1;
  957. }
  958. }
  959. if (compression_switch && jx)
  960. compression_string_size++;
  961. }
  962. done_compression = TRUE;
  963. }
  964. static void compress_makebits(int entnum, int depth, int prevbit,
  965. huffbitlist_t *bits)
  966. {
  967. huffentity_t *ent = &(huff_entities[entnum]);
  968. char *cx;
  969. no_huff_entities++;
  970. ent->addr = compression_table_size;
  971. ent->depth = depth;
  972. ent->bits = *bits;
  973. if (depth > 0) {
  974. if (prevbit)
  975. ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
  976. }
  977. switch (ent->type) {
  978. case 0:
  979. compression_table_size += 9;
  980. compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
  981. compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
  982. break;
  983. case 1:
  984. compression_table_size += 1;
  985. break;
  986. case 2:
  987. compression_table_size += 2;
  988. break;
  989. case 3:
  990. cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH;
  991. compression_table_size += (1 + 1 + strlen(cx));
  992. break;
  993. case 4:
  994. case 9:
  995. compression_table_size += 5;
  996. break;
  997. }
  998. }
  999. /* ------------------------------------------------------------------------- */
  1000. /* The abbreviations optimiser */
  1001. /* */
  1002. /* This is a very complex, memory and time expensive algorithm to */
  1003. /* approximately solve the problem of which abbreviation strings would */
  1004. /* minimise the total number of Z-chars to which the game text translates. */
  1005. /* It is in some ways a quite separate program but remains inside Inform */
  1006. /* for compatibility with previous releases. */
  1007. /* ------------------------------------------------------------------------- */
  1008. typedef struct tlb_s
  1009. { char text[4];
  1010. int32 intab, occurrences;
  1011. } tlb;
  1012. static tlb *tlbtab;
  1013. static int32 no_occs;
  1014. static int32 *grandtable;
  1015. static int32 *grandflags;
  1016. typedef struct optab_s
  1017. { int32 length;
  1018. int32 popularity;
  1019. int32 score;
  1020. int32 location;
  1021. char text[64];
  1022. } optab;
  1023. static optab *bestyet, *bestyet2;
  1024. static int pass_no;
  1025. static char *sub_buffer;
  1026. static void optimise_pass(void)
  1027. { int32 i; int t1, t2;
  1028. int32 j, j2, k, nl, matches, noflags, score, min, minat, x, scrabble, c;
  1029. for (i=0; i<256; i++) bestyet[i].length=0;
  1030. for (i=0; i<no_occs; i++)
  1031. { if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
  1032. {
  1033. #ifdef MAC_FACE
  1034. if (i%((**g_pm_hndl).linespercheck) == 0)
  1035. { ProcessEvents (&g_proc);
  1036. if (g_proc != true)
  1037. { free_arrays();
  1038. if (store_the_text)
  1039. my_free(&all_text,"transcription text");
  1040. longjmp (g_fallback, 1);
  1041. }
  1042. }
  1043. #endif
  1044. printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
  1045. pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
  1046. (long int) tlbtab[i].occurrences);
  1047. t1=(int) (time(0));
  1048. for (j=0; j<tlbtab[i].occurrences; j++)
  1049. { for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
  1050. nl=2; noflags=tlbtab[i].occurrences;
  1051. while ((noflags>=2)&&(nl<=62))
  1052. { nl++;
  1053. for (j2=0; j2<nl; j2++)
  1054. if (all_text[grandtable[tlbtab[i].intab+j]+j2]=='\n')
  1055. goto FinishEarly;
  1056. matches=0;
  1057. for (j2=j; j2<tlbtab[i].occurrences; j2++)
  1058. { if (grandflags[j2]==1)
  1059. { x=grandtable[tlbtab[i].intab+j2]
  1060. - grandtable[tlbtab[i].intab+j];
  1061. if (((x>-nl)&&(x<nl))
  1062. || (memcmp(all_text+grandtable[tlbtab[i].intab+j],
  1063. all_text+grandtable[tlbtab[i].intab+j2],
  1064. nl)!=0))
  1065. { grandflags[j2]=0; noflags--; }
  1066. else matches++;
  1067. }
  1068. }
  1069. scrabble=0;
  1070. for (k=0; k<nl; k++)
  1071. { scrabble++;
  1072. c=all_text[grandtable[tlbtab[i].intab+j+k]];
  1073. if (c!=(int) ' ')
  1074. { if (iso_to_alphabet_grid[c]<0)
  1075. scrabble+=2;
  1076. else
  1077. if (iso_to_alphabet_grid[c]>=26)
  1078. scrabble++;
  1079. }
  1080. }
  1081. score=(matches-1)*(scrabble-2);
  1082. min=score;
  1083. for (j2=0; j2<256; j2++)
  1084. { if ((nl==bestyet[j2].length)
  1085. && (memcmp(all_text+bestyet[j2].location,
  1086. all_text+grandtable[tlbtab[i].intab+j],
  1087. nl)==0))
  1088. { j2=256; min=score; }
  1089. else
  1090. { if (bestyet[j2].score<min)
  1091. { min=bestyet[j2].score; minat=j2;
  1092. }
  1093. }
  1094. }
  1095. if (min!=score)
  1096. { bestyet[minat].score=score;
  1097. bestyet[minat].length=nl;
  1098. bestyet[minat].location=grandtable[tlbtab[i].intab+j];
  1099. bestyet[minat].popularity=matches;
  1100. for (j2=0; j2<nl; j2++) sub_buffer[j2]=
  1101. all_text[bestyet[minat].location+j2];
  1102. sub_buffer[nl]=0;
  1103. }
  1104. }
  1105. FinishEarly: ;
  1106. }
  1107. t2=((int) time(0)) - t1;
  1108. printf(" (%d seconds)\n",t2);
  1109. }
  1110. }
  1111. }
  1112. static int any_overlap(char *s1, char *s2)
  1113. { int a, b, i, j, flag;
  1114. a=strlen(s1); b=strlen(s2);
  1115. for (i=1-b; i<a; i++)
  1116. { flag=0;
  1117. for (j=0; j<b; j++)
  1118. if ((0<=i+j)&&(i+j<=a-1))
  1119. if (s1[i+j]!=s2[j]) flag=1;
  1120. if (flag==0) return(1);
  1121. }
  1122. return(0);
  1123. }
  1124. #define MAX_TLBS 8000
  1125. extern void optimise_abbreviations(void)
  1126. { int32 i, j, t, max=0, MAX_GTABLE;
  1127. int32 j2, selected, available, maxat, nl;
  1128. tlb test;
  1129. printf("Beginning calculation of optimal abbreviations...\n");
  1130. pass_no = 0;
  1131. tlbtab=my_calloc(sizeof(tlb), MAX_TLBS, "tlb table"); no_occs=0;
  1132. sub_buffer=my_calloc(sizeof(char), 4000, "sub_buffer");
  1133. for (i=0; i<MAX_TLBS; i++) tlbtab[i].occurrences=0;
  1134. bestyet=my_calloc(sizeof(optab), 256, "bestyet");
  1135. bestyet2=my_calloc(sizeof(optab), 64, "bestyet2");
  1136. bestyet2[0].text[0]='.';
  1137. bestyet2[0].text[1]=' ';
  1138. bestyet2[0].text[2]=0;
  1139. bestyet2[1].text[0]=',';
  1140. bestyet2[1].text[1]=' ';
  1141. bestyet2[1].text[2]=0;
  1142. for (i=0; all_text+i<all_text_top; i++)
  1143. {
  1144. if ((all_text[i]=='.') && (all_text[i+1]==' ') && (all_text[i+2]==' '))
  1145. { all_text[i]='\n'; all_text[i+1]='\n'; all_text[i+2]='\n';
  1146. bestyet2[0].popularity++;
  1147. }
  1148. if ((all_text[i]=='.') && (all_text[i+1]==' '))
  1149. { all_text[i]='\n'; all_text[i+1]='\n';
  1150. bestyet2[0].popularity++;
  1151. }
  1152. if ((all_text[i]==',') && (all_text[i+1]==' '))
  1153. { all_text[i]='\n'; all_text[i+1]='\n';
  1154. bestyet2[1].popularity++;
  1155. }
  1156. }
  1157. MAX_GTABLE=subtract_pointers(all_text_top,all_text)+1;
  1158. grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
  1159. for (i=0, t=0; all_text+i<all_text_top; i++)
  1160. { test.text[0]=all_text[i];
  1161. test.text[1]=all_text[i+1];
  1162. test.text[2]=all_text[i+2];
  1163. test.text[3]=0;
  1164. if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
  1165. goto DontKeep;
  1166. for (j=0; j<no_occs; j++)
  1167. if (strcmp(test.text,tlbtab[j].text)==0)
  1168. goto DontKeep;
  1169. test.occurrences=0;
  1170. for (j=i+3; all_text+j<all_text_top; j++)
  1171. {
  1172. #ifdef MAC_FACE
  1173. if (j%((**g_pm_hndl).linespercheck) == 0)
  1174. { ProcessEvents (&g_proc);
  1175. if (g_proc != true)
  1176. { free_arrays();
  1177. if (store_the_text)
  1178. my_free(&all_text,"transcription text");
  1179. longjmp (g_fallback, 1);
  1180. }
  1181. }
  1182. #endif
  1183. if ((all_text[i]==all_text[j])
  1184. && (all_text[i+1]==all_text[j+1])
  1185. && (all_text[i+2]==all_text[j+2]))
  1186. { grandtable[t+test.occurrences]=j;
  1187. test.occurrences++;
  1188. if (t+test.occurrences==MAX_GTABLE)
  1189. { printf("All %ld cross-references used\n",
  1190. (long int) MAX_GTABLE);
  1191. goto Built;
  1192. }
  1193. }
  1194. }
  1195. if (test.occurrences>=2)
  1196. { tlbtab[no_occs]=test;
  1197. tlbtab[no_occs].intab=t; t+=tlbtab[no_occs].occurrences;
  1198. if (max<tlbtab[no_occs].occurrences)
  1199. max=tlbtab[no_occs].occurrences;
  1200. no_occs++;
  1201. if (no_occs==MAX_TLBS)
  1202. { printf("All %d three-letter-blocks used\n",
  1203. MAX_TLBS);
  1204. goto Built;
  1205. }
  1206. }
  1207. DontKeep: ;
  1208. }
  1209. Built:
  1210. grandflags=my_calloc(sizeof(int), max, "grandflags");
  1211. printf("Cross-reference table (%ld entries) built...\n",
  1212. (long int) no_occs);
  1213. /* for (i=0; i<no_occs; i++)
  1214. printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
  1215. tlbtab[i].occurrences);
  1216. */
  1217. for (i=0; i<64; i++) bestyet2[i].length=0; selected=2;
  1218. available=256;
  1219. while ((available>0)&&(selected<64))
  1220. { printf("Pass %d\n", ++pass_no);
  1221. optimise_pass();
  1222. available=0;
  1223. for (i=0; i<256; i++)
  1224. if (bestyet[i].score!=0)
  1225. { available++;
  1226. nl=bestyet[i].length;
  1227. for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
  1228. all_text[bestyet[i].location+j2];
  1229. bestyet[i].text[nl]=0;
  1230. }
  1231. /* printf("End of pass results:\n");
  1232. printf("\nno score freq string\n");
  1233. for (i=0; i<256; i++)
  1234. if (bestyet[i].score>0)
  1235. printf("%02d: %4d %4d '%s'\n", i, bestyet[i].score,
  1236. bestyet[i].popularity, bestyet[i].text);
  1237. */
  1238. do
  1239. { max=0;
  1240. for (i=0; i<256; i++)
  1241. if (max<bestyet[i].score)
  1242. { max=bestyet[i].score;
  1243. maxat=i;
  1244. }
  1245. if (max>0)
  1246. { bestyet2[selected++]=bestyet[maxat];
  1247. printf(
  1248. "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
  1249. (long int) selected,bestyet[maxat].text,
  1250. (long int) bestyet[maxat].popularity,
  1251. (long int) bestyet[maxat].score);
  1252. test.text[0]=bestyet[maxat].text[0];
  1253. test.text[1]=bestyet[maxat].text[1];
  1254. test.text[2]=bestyet[maxat].text[2];
  1255. test.text[3]=0;
  1256. for (i=0; i<no_occs; i++)
  1257. if (strcmp(test.text,tlbtab[i].text)==0)
  1258. break;
  1259. for (j=0; j<tlbtab[i].occurrences; j++)
  1260. { if (memcmp(bestyet[maxat].text,
  1261. all_text+grandtable[tlbtab[i].intab+j],
  1262. bestyet[maxat].length)==0)
  1263. { for (j2=0; j2<bestyet[maxat].length; j2++)
  1264. all_text[grandtable[tlbtab[i].intab+j]+j2]='\n';
  1265. }
  1266. }
  1267. for (i=0; i<256; i++)
  1268. if ((bestyet[i].score>0)&&
  1269. (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
  1270. { bestyet[i].score=0;
  1271. /* printf("Discarding '%s' as overlapping\n",
  1272. bestyet[i].text); */
  1273. }
  1274. }
  1275. } while ((max>0)&&(available>0)&&(selected<64));
  1276. }
  1277. printf("\nChosen abbreviations (in Inform syntax):\n\n");
  1278. for (i=0; i<selected; i++)
  1279. printf("Abbreviate \"%s\";\n", bestyet2[i].text);
  1280. text_free_arrays();
  1281. }
  1282. /* ------------------------------------------------------------------------- */
  1283. /* The dictionary manager begins here. */
  1284. /* */
  1285. /* Speed is extremely important in these algorithms. If a linear-time */
  1286. /* routine were used to search the dictionary words so far built up, then */
  1287. /* Inform would crawl. */
  1288. /* */
  1289. /* Instead, the dictionary is stored as a binary tree, which is kept */
  1290. /* balanced with the red-black algorithm. */
  1291. /* ------------------------------------------------------------------------- */
  1292. /* A dictionary table similar to the Z-machine format is kept: there is a */
  1293. /* 7-byte header (left blank here to be filled in at the */
  1294. /* construct_storyfile() stage in "tables.c") and then a sequence of */
  1295. /* records, one per word, in the form */
  1296. /* */
  1297. /* <Z-coded text> <flags> <verbnumber> <adjectivenumber> */
  1298. /* 4 or 6 bytes byte byte byte */
  1299. /* */
  1300. /* For Glulx, the form is instead: (But see below about Unicode-valued */
  1301. /* dictionaries and my heinie.) */
  1302. /* */
  1303. /* <plain text> <flags> <verbnumber> <adjectivenumber> */
  1304. /* DICT_WORD_SIZE short short short */
  1305. /* */
  1306. /* These records are stored in "accession order" (i.e. in order of their */
  1307. /* first being received by these routines) and only alphabetically sorted */
  1308. /* by construct_storyfile() (using the array below). */
  1309. /* ------------------------------------------------------------------------- */
  1310. /* */
  1311. /* Further notes about the data fields... */
  1312. /* The flags are currently: */
  1313. /* bit 0: word is used as a verb (in verb grammar) */
  1314. /* bit 1: word is used as a meta verb */
  1315. /* bit 2: word is plural (set by '//p') */
  1316. /* bit 3: word is used as a preposition (in verb grammar) */
  1317. /* bit 6: set for all verbs, but not used by the parser? */
  1318. /* bit 7: word is used as a noun (set for every word that appears in */
  1319. /* code or in an object property) */
  1320. /* */
  1321. /* In grammar version 2, the third field (adjectivenumber) is unused (and */
  1322. /* zero). */
  1323. /* */
  1324. /* The compiler generates special constants #dict_par1, #dict_par2, */
  1325. /* #dict_par3 to refer to the byte offsets of the three fields. In */
  1326. /* Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they */
  1327. /* are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three */
  1328. /* fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.) */
  1329. /* ------------------------------------------------------------------------- */
  1330. uchar *dictionary, /* (These two pointers are externally
  1331. used only in "tables.c" when
  1332. building the story-file) */
  1333. *dictionary_top; /* Pointer to next free record */
  1334. int dict_entries; /* Total number of records entered */
  1335. /* ------------------------------------------------------------------------- */
  1336. /* dict_word is a typedef for a struct of 6 unsigned chars (defined in */
  1337. /* "header.h"): it holds the (4 or) 6 bytes of Z-coded text of a word. */
  1338. /* Usefully, because the PAD character 5 is < all alphabetic characters, */
  1339. /* alphabetic order corresponds to numeric order. For this reason, the */
  1340. /* dict_word is called the "sort code" of the original text word. */
  1341. /* */
  1342. /* ###- In modifying the compiler, I've found it easier to discard the */
  1343. /* typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
  1344. /* In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work */
  1345. /* as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE. */
  1346. /* (That limit is defined as 40 in the header; it exists only for a few */
  1347. /* static buffers, and can be increased without using significant memory.) */
  1348. /* */
  1349. /* ###- Well, that certainly bit me on the butt, didn't it. In further */
  1350. /* modifying the compiler to generate a Unicode dictionary, I have to */
  1351. /* store four-byte values in the uchar array. This is handled by making */
  1352. /* the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/
  1353. /* Then we store the 32-bit character value big-endian. This lets us */
  1354. /* continue to compare arrays bytewise, which is a nice simplification. */
  1355. /* ------------------------------------------------------------------------- */
  1356. extern int compare_sorts(uchar *d1, uchar *d2)
  1357. { int i;
  1358. for (i=0; i<DICT_WORD_BYTES; i++)
  1359. if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
  1360. /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
  1361. libraries) */
  1362. return(0);
  1363. }
  1364. extern void copy_sorts(uchar *d1, uchar *d2)
  1365. { int i;
  1366. for (i=0; i<DICT_WORD_BYTES; i++)
  1367. d1[i] = d2[i];
  1368. }
  1369. static uchar prepared_sort[MAX_DICT_WORD_BYTES]; /* Holds the sort code
  1370. of current word */
  1371. static int number_and_case;
  1372. /* Also used by verbs.c */
  1373. static void dictionary_prepare_z(char *dword, uchar *optresult)
  1374. { int i, j, k, k2, wd[13]; int32 tot;
  1375. /* A rapid text translation algorithm using only the simplified rules
  1376. applying to the text of dictionary entries: first produce a sequence
  1377. of 6 (v3) or 9 (v4+) Z-characters */
  1378. number_and_case = 0;
  1379. for (i=0, j=0; dword[j]!=0; i++, j++)
  1380. { if ((dword[j] == '/') && (dword[j+1] == '/'))
  1381. { for (j+=2; dword[j] != 0; j++)
  1382. { switch(dword[j])
  1383. { case 'p': number_and_case |= 4; break;
  1384. default:
  1385. error_named("Expected 'p' after '//' \
  1386. to give number of dictionary word", dword);
  1387. break;
  1388. }
  1389. }
  1390. break;
  1391. }
  1392. if (i>=9) break;
  1393. k=(int) dword[j];
  1394. if (k==(int) '\'')
  1395. warning_named("Obsolete usage: use the ^ character for the \
  1396. apostrophe in", dword);
  1397. if (k==(int) '^') k=(int) '\'';
  1398. if (k=='\"') k='~';
  1399. if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
  1400. { int unicode = text_to_unicode(dword+j);
  1401. if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
  1402. k = unicode_to_zscii(unicode);
  1403. j += textual_form_length - 1;
  1404. if ((k == 5) || (k >= 0x100))
  1405. { unicode_char_error(
  1406. "Character can be printed but not input:", unicode);
  1407. k = '?';
  1408. }
  1409. k2 = zscii_to_alphabet_grid[(uchar) k];
  1410. }
  1411. else
  1412. { if (isupper(k)) k = tolower(k);
  1413. k2 = iso_to_alphabet_grid[(uchar) k];
  1414. }
  1415. if (k2 < 0)
  1416. { if ((k2 == -5) || (k2 <= -0x100))
  1417. char_error("Character can be printed but not input:", k);
  1418. else
  1419. { /* Use 4 more Z-chars to encode a ZSCII escape sequence */
  1420. wd[i++] = 5; wd[i++] = 6;
  1421. k2 = -k2;
  1422. wd[i++] = k2/32; wd[i] = k2%32;
  1423. }
  1424. }
  1425. else
  1426. { alphabet_used[k2] = 'Y';
  1427. if ((k2/26)!=0)
  1428. wd[i++]=3+(k2/26); /* Change alphabet for symbols */
  1429. wd[i]=6+(k2%26); /* Write the Z character */
  1430. }
  1431. }
  1432. /* Fill up to the end of the dictionary block with PAD characters */
  1433. for (; i<9; i++) wd[i]=5;
  1434. /* The array of Z-chars is converted to three 2-byte blocks */
  1435. tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
  1436. prepared_sort[1]=tot%0x100;
  1437. prepared_sort[0]=(tot/0x100)%0x100;
  1438. tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
  1439. prepared_sort[3]=tot%0x100;
  1440. prepared_sort[2]=(tot/0x100)%0x100;
  1441. tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
  1442. prepared_sort[5]=tot%0x100;
  1443. prepared_sort[4]=(tot/0x100)%0x100;
  1444. /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block */
  1445. if (version_number==3) prepared_sort[2]+=0x80;
  1446. else prepared_sort[4]+=0x80;
  1447. if (optresult) copy_sorts(optresult, prepared_sort);
  1448. }
  1449. /* Also used by verbs.c */
  1450. static void dictionary_prepare_g(char *dword, uchar *optresult)
  1451. {
  1452. int i, j, k;
  1453. int32 unicode;
  1454. number_and_case = 0;
  1455. for (i=0, j=0; (dword[j]!=0); i++, j++) {
  1456. if ((dword[j] == '/') && (dword[j+1] == '/')) {
  1457. for (j+=2; dword[j] != 0; j++) {
  1458. switch(dword[j]) {
  1459. case 'p':
  1460. number_and_case |= 4;
  1461. break;
  1462. default:
  1463. error_named("Expected 'p' after '//' \
  1464. to give gender or number of dictionary word", dword);
  1465. break;
  1466. }
  1467. }
  1468. break;
  1469. }
  1470. if (i>=DICT_WORD_SIZE) break;
  1471. k= ((unsigned char *)dword)[j];
  1472. if (k=='\'')
  1473. warning_named("Obsolete usage: use the ^ character for the \
  1474. apostrophe in", dword);
  1475. if (k=='^')
  1476. k='\'';
  1477. if (k=='~') /* as in iso_to_alphabet_grid */
  1478. k='\"';
  1479. if (k=='@' || (character_set_unicode && (k & 0x80))) {
  1480. unicode = text_to_unicode(dword+j);
  1481. j += textual_form_length - 1;
  1482. }
  1483. else {
  1484. unicode = iso_to_unicode_grid[k];
  1485. }
  1486. if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
  1487. k = unicode;
  1488. }
  1489. else {
  1490. error("The dictionary cannot contain Unicode characters beyond Latin-1. \
  1491. Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
  1492. k = '?';
  1493. }
  1494. if (k >= (unsigned)'A' && k <= (unsigned)'Z')
  1495. k += ('a' - 'A');
  1496. if (DICT_CHAR_SIZE == 1) {
  1497. prepared_sort[i] = k;
  1498. }
  1499. else {
  1500. prepared_sort[4*i] = (k >> 24) & 0xFF;
  1501. prepared_sort[4*i+1] = (k >> 16) & 0xFF;
  1502. prepared_sort[4*i+2] = (k >> 8) & 0xFF;
  1503. prepared_sort[4*i+3] = (k) & 0xFF;
  1504. }
  1505. }
  1506. if (DICT_CHAR_SIZE == 1) {
  1507. for (; i<DICT_WORD_SIZE; i++)
  1508. prepared_sort[i] = 0;
  1509. }
  1510. else {
  1511. for (; i<DICT_WORD_SIZE; i++) {
  1512. prepared_sort[4*i] = 0;
  1513. prepared_sort[4*i+1] = 0;
  1514. prepared_sort[4*i+2] = 0;
  1515. prepared_sort[4*i+3] = 0;
  1516. }
  1517. }
  1518. if (optresult) copy_sorts(optresult, prepared_sort);
  1519. }
  1520. extern void dictionary_prepare(char *dword, uchar *optresult)
  1521. {
  1522. if (!glulx_mode)
  1523. dictionary_prepare_z(dword, optresult);
  1524. else
  1525. dictionary_prepare_g(dword, optresult);
  1526. }
  1527. /* ------------------------------------------------------------------------- */
  1528. /* The arrays below are all concerned with the problem of alphabetically */
  1529. /* sorting the dictionary during the compilation pass. */
  1530. /* Note that it is not enough simply to apply qsort to the dictionary at */
  1531. /* the end of the pass: we need to ensure that no duplicates are ever */
  1532. /* created. */
  1533. /* */
  1534. /* dict_sort_codes[n] the sort code of record n: i.e., of the nth */
  1535. /* word to be entered into the dictionary, where */
  1536. /* n counts upward from 0 */
  1537. /* (n is also called the "accession number") */
  1538. /* */
  1539. /* The tree structure encodes an ordering. The special value VACANT means */
  1540. /* "no node here": otherwise, node numbers are the same as accession */
  1541. /* numbers. At all times, "root" holds the node number of the top of the */
  1542. /* tree; each node has up to two branches, such that the subtree of the */
  1543. /* left branch is always alphabetically before what's at the node, and */
  1544. /* the subtree to the right is always after; and all branches are coloured */
  1545. /* either "black" or "red". These colours are used to detect points where */
  1546. /* the tree is growing asymmetrically (and therefore becoming inefficient */
  1547. /* to search). */
  1548. /* ------------------------------------------------------------------------- */
  1549. #define RED 'r'
  1550. #define BLACK 'b'
  1551. #define VACANT -1
  1552. static int root;
  1553. typedef struct dict_tree_node_s
  1554. { int branch[2]; /* Branch 0 is "left", 1 is "right" */
  1555. char colour; /* The colour of the branch to the parent */
  1556. } dict_tree_node;
  1557. static dict_tree_node *dtree;
  1558. int *final_dict_order;
  1559. static uchar *dict_sort_codes;
  1560. static void dictionary_begin_pass(void)
  1561. {
  1562. /* Leave room for the 7-byte header (added in "tables.c" much later) */
  1563. /* Glulx has a 4-byte header instead. */
  1564. if (!glulx_mode)
  1565. dictionary_top=dictionary+7;
  1566. else
  1567. dictionary_top=dictionary+4;
  1568. root = VACANT;
  1569. dict_entries = 0;
  1570. }
  1571. static int fdo_count;
  1572. static void recursively_sort(int node)
  1573. { if (dtree[node].branch[0] != VACANT)
  1574. recursively_sort(dtree[node].branch[0]);
  1575. final_dict_order[node] = fdo_count++;
  1576. if (dtree[node].branch[1] != VACANT)
  1577. recursively_sort(dtree[node].branch[1]);
  1578. }
  1579. extern void sort_dictionary(void)
  1580. { int i;
  1581. if (module_switch)
  1582. { for (i=0; i<dict_entries; i++)
  1583. final_dict_order[i] = i;
  1584. return;
  1585. }
  1586. if (root != VACANT)
  1587. { fdo_count = 0; recursively_sort(root);
  1588. }
  1589. }
  1590. /* ------------------------------------------------------------------------- */
  1591. /* If "dword" is in the dictionary, return its accession number plus 1; */
  1592. /* If not, return 0. */
  1593. /* ------------------------------------------------------------------------- */
  1594. static int dictionary_find(char *dword)
  1595. { int at = root, n;
  1596. dictionary_prepare(dword, NULL);
  1597. while (at != VACANT)
  1598. { n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
  1599. if (n==0) return at + 1;
  1600. if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
  1601. }
  1602. return 0;
  1603. }
  1604. /* ------------------------------------------------------------------------- */
  1605. /* Add "dword" to the dictionary with (x,y,z) as its data fields; unless */
  1606. /* it already exists, in which case OR the data with (x,y,z) */
  1607. /* */
  1608. /* These fields are one byte each in Z-code, two bytes each in Glulx. */
  1609. /* */
  1610. /* Returns: the accession number. */
  1611. /* ------------------------------------------------------------------------- */
  1612. extern int dictionary_add(char *dword, int x, int y, int z)
  1613. { int n; uchar *p;
  1614. int ggfr = 0, gfr = 0, fr = 0, r = 0;
  1615. int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
  1616. int a, b;
  1617. int res=((version_number==3)?4:6);
  1618. dictionary_prepare(dword, NULL);
  1619. if (root == VACANT)
  1620. { root = 0; goto CreateEntry;
  1621. }
  1622. while (TRUE)
  1623. {
  1624. n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
  1625. if (n==0)
  1626. {
  1627. if (!glulx_mode) {
  1628. p = dictionary+7 + at*(3+res) + res;
  1629. p[0]=(p[0])|x; p[1]=(p[1])|y; p[2]=(p[2])|z;
  1630. if (x & 128) p[0] = (p[0])|number_and_case;
  1631. }
  1632. else {
  1633. p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
  1634. p[0]=(p[0])|(x/256); p[1]=(p[1])|(x%256);
  1635. p[2]=(p[2])|(y/256); p[3]=(p[3])|(y%256);
  1636. p[4]=(p[4])|(z/256); p[5]=(p[5])|(z%256);
  1637. if (x & 128) p[1] = (p[1]) | number_and_case;
  1638. }
  1639. return at;
  1640. }
  1641. if (n>0) r=1; else r=0;
  1642. a = dtree[at].branch[0]; b = dtree[at].branch[1];
  1643. if ((a != VACANT) && (dtree[a].colour == RED) &&
  1644. (b != VACANT) && (dtree[b].colour == RED))
  1645. { dtree[a].colour = BLACK;
  1646. dtree[b].colour = BLACK;
  1647. dtree[at].colour = RED;
  1648. /* A tree rotation may be needed to avoid two red links in a row:
  1649. e.g.
  1650. ggf (or else gf is root) ggf (or f is root)
  1651. | |
  1652. gf f
  1653. / \(red) / \ (both red)
  1654. f becomes gf at
  1655. / \(red) / \ / \
  1656. at
  1657. / \
  1658. In effect we rehang the "gf" subtree from "f".
  1659. See the Technical Manual for further details.
  1660. */
  1661. if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
  1662. {
  1663. if (fr == gfr)
  1664. { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
  1665. dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
  1666. dtree[f].branch[1-fr] = gf;
  1667. dtree[f].colour = BLACK;
  1668. dtree[gf].colour = RED;
  1669. gf = ggf; gfr = ggfr;
  1670. }
  1671. else
  1672. { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
  1673. dtree[at].colour = BLACK;
  1674. dtree[gf].colour = RED;
  1675. dtree[f].branch[fr] = dtree[at].branch[gfr];
  1676. dtree[gf].branch[gfr] = dtree[at].branch[fr];
  1677. dtree[at].branch[gfr] = f;
  1678. dtree[at].branch[fr] = gf;
  1679. r = 1-r; n = at; if (r==fr) at = f; else at = gf;
  1680. f = n; gf = ggf; fr = 1-r; gfr = ggfr;
  1681. }
  1682. }
  1683. }
  1684. if (dtree[at].branch[r] == VACANT)
  1685. { dtree[at].colour = RED;
  1686. if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
  1687. { if (fr == gfr)
  1688. { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
  1689. dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
  1690. dtree[f].branch[1-fr] = gf;
  1691. dtree[f].colour = BLACK;
  1692. dtree[gf].colour = RED;
  1693. }
  1694. else
  1695. { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
  1696. dtree[at].colour = BLACK;
  1697. dtree[gf].colour = RED;
  1698. dtree[f].branch[fr] = dtree[at].branch[gfr];
  1699. dtree[gf].branch[gfr] = dtree[at].branch[fr];
  1700. dtree[at].branch[gfr] = f;
  1701. dtree[at].branch[fr] = gf;
  1702. r = 1-r; n = at; if (r==fr) at = f; else at = gf;
  1703. f = n; gf = ggf;
  1704. }
  1705. }
  1706. dtree[at].branch[r] = dict_entries;
  1707. goto CreateEntry;
  1708. }
  1709. ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
  1710. ggfr = gfr; gfr = fr; fr = r;
  1711. }
  1712. CreateEntry:
  1713. if (dict_entries==MAX_DICT_ENTRIES)
  1714. memoryerror("MAX_DICT_ENTRIES",MAX_DICT_ENTRIES);
  1715. dtree[dict_entries].branch[0] = VACANT;
  1716. dtree[dict_entries].branch[1] = VACANT;
  1717. dtree[dict_entries].colour = BLACK;
  1718. /* Address in Inform's own dictionary table to write the record to */
  1719. if (!glulx_mode) {
  1720. p = dictionary + (3+res)*dict_entries + 7;
  1721. /* So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
  1722. bytes */
  1723. p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
  1724. p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
  1725. if (version_number > 3)
  1726. { p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
  1727. p[res]=x; p[res+1]=y; p[res+2]=z;
  1728. if (x & 128) p[res] = (p[res])|number_and_case;
  1729. dictionary_top += res+3;
  1730. }
  1731. else {
  1732. int i;
  1733. p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
  1734. p[0] = 0x60; /* type byte -- dict word */
  1735. p += DICT_CHAR_SIZE;
  1736. for (i=0; i<DICT_WORD_BYTES; i++)
  1737. p[i] = prepared_sort[i];
  1738. p += DICT_WORD_BYTES;
  1739. p[0] = 0; p[1] = x;
  1740. p[2] = y/256; p[3] = y%256;
  1741. p[4] = 0; p[5] = z;
  1742. if (x & 128)
  1743. p[1] |= number_and_case;
  1744. dictionary_top += DICT_ENTRY_BYTE_LENGTH;
  1745. }
  1746. copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
  1747. return dict_entries++;
  1748. }
  1749. /* ------------------------------------------------------------------------- */
  1750. /* Used in "tables.c" for "Extend ... only", to renumber a verb-word to a */
  1751. /* new verb syntax of its own. (Otherwise existing verb-words never */
  1752. /* change their verb-numbers.) */
  1753. /* ------------------------------------------------------------------------- */
  1754. extern void dictionary_set_verb_number(char *dword, int to)
  1755. { int i; uchar *p;
  1756. int res=((version_number==3)?4:6);
  1757. i=dictionary_find(dword);
  1758. if (i!=0)
  1759. {
  1760. if (!glulx_mode) {
  1761. p=dictionary+7+(i-1)*(3+res)+res;
  1762. p[1]=to;
  1763. }
  1764. else {
  1765. p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
  1766. p[2]=to/256; p[3]=to%256;
  1767. }
  1768. }
  1769. }
  1770. /* ------------------------------------------------------------------------- */
  1771. /* Tracing code for the dictionary: used not only by "trace" and text */
  1772. /* transcription, but also (in the case of "word_to_ascii") in a vital */
  1773. /* by the linker. */
  1774. /* ------------------------------------------------------------------------- */
  1775. static char *d_show_to;
  1776. static int d_show_total;
  1777. static void show_char(char c)
  1778. { if (d_show_to == NULL) printf("%c", c);
  1779. else
  1780. { int i = strlen(d_show_to);
  1781. d_show_to[i] = c; d_show_to[i+1] = 0;
  1782. }
  1783. }
  1784. extern void word_to_ascii(uchar *p, char *results)
  1785. { int i, shift, cc, zchar; uchar encoded_word[9];
  1786. encoded_word[0] = (((int) p[0])&0x7c)/4;
  1787. encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
  1788. encoded_word[2] = ((int) p[1])&0x1f;
  1789. encoded_word[3] = (((int) p[2])&0x7c)/4;
  1790. encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
  1791. encoded_word[5] = ((int) p[3])&0x1f;
  1792. if (version_number > 3)
  1793. { encoded_word[6] = (((int) p[4])&0x7c)/4;
  1794. encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
  1795. encoded_word[8] = ((int) p[5])&0x1f;
  1796. }
  1797. shift = 0; cc = 0;
  1798. for (i=0; i< ((version_number==3)?6:9); i++)
  1799. { zchar = encoded_word[i];
  1800. if (zchar == 4) shift = 1;
  1801. else
  1802. if (zchar == 5) shift = 2;
  1803. else
  1804. { if ((shift == 2) && (zchar == 6))
  1805. { zchar = 32*encoded_word[i+1] + encoded_word[i+2];
  1806. i += 2;
  1807. if ((zchar>=32) && (zchar<=126))
  1808. results[cc++] = zchar;
  1809. else
  1810. { zscii_to_text(results+cc, zchar);
  1811. cc = strlen(results);
  1812. }
  1813. }
  1814. else
  1815. { zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
  1816. cc = strlen(results);
  1817. }
  1818. shift = 0;
  1819. }
  1820. }
  1821. results[cc] = 0;
  1822. }
  1823. static void recursively_show_z(int node)
  1824. { int i, cprinted, flags; uchar *p;
  1825. char textual_form[32];
  1826. int res = (version_number == 3)?4:6;
  1827. if (dtree[node].branch[0] != VACANT)
  1828. recursively_show_z(dtree[node].branch[0]);
  1829. p = (uchar *)dictionary + 7 + (3+res)*node;
  1830. word_to_ascii(p, textual_form);
  1831. for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
  1832. show_char(textual_form[cprinted]);
  1833. for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
  1834. show_char(' ');
  1835. if (d_show_to == NULL)
  1836. { for (i=0; i<3+res; i++) printf("%02x ",p[i]);
  1837. flags = (int) p[res];
  1838. if (flags & 128)
  1839. { printf("noun ");
  1840. if (flags & 4) printf("p"); else printf(" ");
  1841. printf(" ");
  1842. }
  1843. else printf(" ");
  1844. if (flags & 8)
  1845. { if (grammar_version_number == 1)
  1846. printf("preposition:%d ", (int) p[res+2]);
  1847. else
  1848. printf("preposition ");
  1849. }
  1850. if ((flags & 3) == 3) printf("metaverb:%d ", (int) p[res+1]);
  1851. else if ((flags & 3) == 1) printf("verb:%d ", (int) p[res+1]);
  1852. printf("\n");
  1853. }
  1854. if (d_show_total++ == 5)
  1855. { d_show_total = 0;
  1856. if (d_show_to != NULL)
  1857. { write_to_transcript_file(d_show_to);
  1858. d_show_to[0] = 0;
  1859. }
  1860. }
  1861. if (dtree[node].branch[1] != VACANT)
  1862. recursively_show_z(dtree[node].branch[1]);
  1863. }
  1864. static void recursively_show_g(int node)
  1865. {
  1866. warning("### Glulx dictionary-show not yet implemented.\n");
  1867. }
  1868. static void show_alphabet(int i)
  1869. { int j, c; char chartext[8];
  1870. for (j=0; j<26; j++)
  1871. { c = alphabet[i][j];
  1872. if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
  1873. zscii_to_text(chartext, c);
  1874. printf("%s", chartext);
  1875. if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
  1876. }
  1877. printf("\n");
  1878. }
  1879. extern void show_dictionary(void)
  1880. { printf("Dictionary contains %d entries:\n",dict_entries);
  1881. if (dict_entries != 0)
  1882. { d_show_total = 0; d_show_to = NULL;
  1883. if (!glulx_mode)
  1884. recursively_show_z(root);
  1885. else
  1886. recursively_show_g(root);
  1887. }
  1888. printf("\nZ-machine alphabet entries:\n");
  1889. show_alphabet(0);
  1890. show_alphabet(1);
  1891. show_alphabet(2);
  1892. }
  1893. extern void write_dictionary_to_transcript(void)
  1894. { char d_buffer[81];
  1895. sprintf(d_buffer, "\n[Dictionary contains %d entries:]\n", dict_entries);
  1896. d_buffer[0] = 0; write_to_transcript_file(d_buffer);
  1897. if (dict_entries != 0)
  1898. { d_show_total = 0; d_show_to = d_buffer;
  1899. if (!glulx_mode)
  1900. recursively_show_z(root);
  1901. else
  1902. recursively_show_g(root);
  1903. }
  1904. if (d_show_total != 0) write_to_transcript_file(d_buffer);
  1905. }
  1906. /* ========================================================================= */
  1907. /* Data structure management routines */
  1908. /* ------------------------------------------------------------------------- */
  1909. extern void init_text_vars(void)
  1910. { int j;
  1911. bestyet = NULL;
  1912. bestyet2 = NULL;
  1913. tlbtab = NULL;
  1914. grandtable = NULL;
  1915. grandflags = NULL;
  1916. no_chars_transcribed = 0;
  1917. is_abbreviation = FALSE;
  1918. put_strings_in_low_memory = FALSE;
  1919. for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
  1920. total_zchars_trans = 0;
  1921. dtree = NULL;
  1922. final_dict_order = NULL;
  1923. dict_sort_codes = NULL;
  1924. dict_entries=0;
  1925. initialise_memory_block(&static_strings_area);
  1926. }
  1927. extern void text_begin_pass(void)
  1928. { abbrevs_lookup_table_made = FALSE;
  1929. no_abbreviations=0;
  1930. total_chars_trans=0; total_bytes_trans=0;
  1931. if (store_the_text) all_text_top=all_text;
  1932. dictionary_begin_pass();
  1933. low_strings_top = low_strings;
  1934. static_strings_extent = 0;
  1935. no_strings = 0;
  1936. no_dynamic_strings = 0;
  1937. no_unicode_chars = 0;
  1938. }
  1939. /* Note: for allocation and deallocation of all_the_text, see inform.c */
  1940. extern void text_allocate_arrays(void)
  1941. { abbreviations_at = my_malloc(MAX_ABBREVS*MAX_ABBREV_LENGTH,
  1942. "abbreviations");
  1943. abbrev_values = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev values");
  1944. abbrev_quality = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev quality");
  1945. abbrev_freqs = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev freqs");
  1946. dtree = my_calloc(sizeof(dict_tree_node), MAX_DICT_ENTRIES,
  1947. "red-black tree for dictionary");
  1948. final_dict_order = my_calloc(sizeof(int), MAX_DICT_ENTRIES,
  1949. "final dictionary ordering table");
  1950. dict_sort_codes = my_calloc(DICT_WORD_BYTES, MAX_DICT_ENTRIES,
  1951. "dictionary sort codes");
  1952. if (!glulx_mode)
  1953. dictionary = my_malloc(9*MAX_DICT_ENTRIES+7,
  1954. "dictionary");
  1955. else
  1956. dictionary = my_malloc(DICT_ENTRY_BYTE_LENGTH*MAX_DICT_ENTRIES+4,
  1957. "dictionary");
  1958. strings_holding_area
  1959. = my_malloc(MAX_STATIC_STRINGS,"static strings holding area");
  1960. low_strings = my_malloc(MAX_LOW_STRINGS,"low (abbreviation) strings");
  1961. huff_entities = NULL;
  1962. hufflist = NULL;
  1963. unicode_usage_entries = NULL;
  1964. done_compression = FALSE;
  1965. compression_table_size = 0;
  1966. compressed_offsets = NULL;
  1967. MAX_CHARACTER_SET = 0;
  1968. if (glulx_mode) {
  1969. if (compression_switch) {
  1970. int ix;
  1971. MAX_CHARACTER_SET = 257 + MAX_ABBREVS + MAX_DYNAMIC_STRINGS
  1972. + MAX_UNICODE_CHARS;
  1973. huff_entities = my_calloc(sizeof(huffentity_t), MAX_CHARACTER_SET*2+1,
  1974. "huffman entities");
  1975. hufflist = my_calloc(sizeof(huffentity_t *), MAX_CHARACTER_SET,
  1976. "huffman node list");
  1977. unicode_usage_entries = my_calloc(sizeof(unicode_usage_t),
  1978. MAX_UNICODE_CHARS, "unicode entity entries");
  1979. for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
  1980. unicode_usage_hash[ix] = NULL;
  1981. }
  1982. compressed_offsets = my_calloc(sizeof(int32), MAX_NUM_STATIC_STRINGS,
  1983. "static strings index table");
  1984. }
  1985. }
  1986. extern void text_free_arrays(void)
  1987. {
  1988. my_free(&strings_holding_area, "static strings holding area");
  1989. my_free(&low_strings, "low (abbreviation) strings");
  1990. my_free(&abbreviations_at, "abbreviations");
  1991. my_free(&abbrev_values, "abbrev values");
  1992. my_free(&abbrev_quality, "abbrev quality");
  1993. my_free(&abbrev_freqs, "abbrev freqs");
  1994. my_free(&dtree, "red-black tree for dictionary");
  1995. my_free(&final_dict_order, "final dictionary ordering table");
  1996. my_free(&dict_sort_codes, "dictionary sort codes");
  1997. my_free(&dictionary,"dictionary");
  1998. my_free(&compressed_offsets, "static strings index table");
  1999. my_free(&hufflist, "huffman node list");
  2000. my_free(&huff_entities, "huffman entities");
  2001. my_free(&unicode_usage_entries, "unicode entity entities");
  2002. deallocate_memory_block(&static_strings_area);
  2003. }
  2004. extern void ao_free_arrays(void)
  2005. { my_free (&tlbtab,"tlb table");
  2006. my_free (&sub_buffer,"sub_buffer");
  2007. my_free (&bestyet,"bestyet");
  2008. my_free (&bestyet2,"bestyet2");
  2009. my_free (&grandtable,"grandtable");
  2010. my_free (&grandflags,"grandflags");
  2011. }
  2012. /* ========================================================================= */