preprocess.c 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. #include "defs.h"
  2. #include "utils.h"
  3. int main(int argc, char* argv[]) {
  4. string_internment_table_init(&global_string_internment_table);
  5. srand(argc > 1 ? atoi(argv[1]) : 0);
  6. book_info* bi = calloc(1, sizeof(*bi));
  7. HT_init(&bi->word_lookup, 1024 * 16);
  8. char* src = read_whole_file("watwe.txt", NULL);
  9. bi->words_alloc = 1024 * 8;
  10. bi->words_len = 0;
  11. bi->words = calloc(1, sizeof(*bi->words) * bi->words_alloc);
  12. bi->word_list_alloc = 1024 * 256;
  13. bi->word_list_len = 0;
  14. bi->word_list = calloc(1, sizeof(*bi->word_list) * bi->word_list_alloc);
  15. bi->sentence_list_alloc = 1024 * 8;
  16. bi->sentence_list_len = 0;
  17. bi->sentence_list = calloc(1, sizeof(*bi->sentence_list) * bi->sentence_list_alloc);
  18. bi->paragraph_list_alloc = 1024 * 4;
  19. bi->paragraph_list_len = 0;
  20. bi->paragraph_list = calloc(1, sizeof(*bi->paragraph_list) * bi->paragraph_list_alloc);
  21. bi->chapter_list_alloc = 256;
  22. bi->chapter_list_len = 0;
  23. bi->chapter_list = calloc(1, sizeof(*bi->chapter_list) * bi->chapter_list_alloc);
  24. chapter_info* chapter = bi->chapter_list;
  25. paragraph_info* paragraph = bi->paragraph_list;
  26. sentence_info* sentence = bi->sentence_list;
  27. sentence->word_list = bi->word_list + bi->word_list_len;
  28. // strtolower(src);
  29. int words_into_chapter = 0;
  30. int words_into_paragraph = 0;
  31. int words_into_sentence = 0;
  32. int sentences_into_chapter = 0;
  33. int sentences_into_paragraph = 0;
  34. int paragraphs_into_chapter = 0;
  35. int chapters_into_book = 0;
  36. char* s = src;
  37. while(*s) {
  38. if(s[0] == '\n' && s[1] == '\n') {
  39. paragraph++;
  40. bi->paragraph_list_len++;
  41. paragraph->num_words = words_into_paragraph;
  42. paragraph->num_sentences = sentences_into_paragraph;
  43. paragraph->start_word = bi->word_list_len;
  44. paragraph->start_sentence = bi->sentence_list_len;
  45. words_into_sentence = 0;
  46. words_into_paragraph = 0;
  47. sentences_into_paragraph = 0;
  48. paragraphs_into_chapter++;
  49. while(*s == '\n') s++;
  50. }
  51. if(isspace(*s)) {
  52. s++;
  53. continue;
  54. }
  55. if(strchr(",\";:", *s)) {
  56. s++;
  57. continue;
  58. }
  59. if(strchr(".?!", *s)) {
  60. sentence++;
  61. bi->sentence_list_len++;
  62. words_into_sentence = 0;
  63. sentences_into_chapter++;
  64. sentences_into_paragraph++;
  65. sentence->start_word = bi->word_list_len;
  66. sentence->word_list = bi->word_list + bi->word_list_len;
  67. s++;
  68. continue;
  69. }
  70. if(isalnum(*s)) {
  71. if(*s == 'C' && s[1] == 'H' && !strncmp(s, "CHAPTER ", strlen("CHAPTER "))) {
  72. chapters_into_book++;
  73. chapter++;
  74. bi->chapter_list_len++;
  75. words_into_sentence = 0;
  76. words_into_paragraph = 0;
  77. words_into_chapter = 0;
  78. sentences_into_chapter = 0;
  79. sentences_into_paragraph = 0;
  80. paragraphs_into_chapter = 0;
  81. chapter->start_word = bi->word_list_len;
  82. chapter->start_sentence = bi->sentence_list_len;
  83. chapter->start_paragraph = bi->paragraph_list_len;
  84. while(*s != '\n') s++; // skip the "chapter" line
  85. s++;
  86. while(*s != '\n') s++; // skip the next blank line
  87. s++;
  88. while(*s != '\n') s++; // skip the teaser line
  89. s++;
  90. continue;
  91. }
  92. if(*s == 'B' && s[1] == 'O' && !strncmp(s, "BOOK ", strlen("BOOK "))) {
  93. words_into_sentence = 0;
  94. words_into_paragraph = 0;
  95. words_into_chapter = 0;
  96. sentences_into_chapter = 0;
  97. sentences_into_paragraph = 0;
  98. paragraphs_into_chapter = 0;
  99. while(*s != '\n') s++; // skip the "book" line
  100. s++;
  101. while(*s != '\n') s++; // skip the next blank line
  102. s++;
  103. while(*s != '\n') s++; // skip the teaser line
  104. s++;
  105. continue;
  106. }
  107. char* begin = s;
  108. while(isalnum(*s) || *s == '\'') s++;
  109. int len = s - begin;
  110. char* w = strtolower(strnint(begin, len));
  111. word_stats* st;
  112. if(HT_get(&bi->word_lookup, w, &st)) {
  113. st = bi->words + bi->words_len;
  114. HT_set(&bi->word_lookup, w, st);
  115. st->text = w;
  116. st->ordinal = bi->words_len++;
  117. }
  118. st->count++;
  119. bi->word_list[bi->word_list_len++] = st->ordinal;
  120. if(words_into_sentence == 0) st->starts_sentence++;
  121. if(words_into_paragraph == 0) st->starts_paragraph++;
  122. if(words_into_chapter == 0) st->starts_chapter++;
  123. if(words_into_sentence == 0) printf("\n%d:%d ", chapters_into_book, sentences_into_paragraph);
  124. if(words_into_sentence < 4) printf("%d(%s) ", words_into_sentence, w);
  125. sentence->num_words++;
  126. paragraph->num_words++;
  127. chapter->num_words++;
  128. words_into_sentence++;
  129. words_into_paragraph++;
  130. words_into_chapter++;
  131. continue;
  132. }
  133. s++;
  134. }
  135. // shorthands and settings
  136. int nwords = bi->words_len;
  137. float dist_factor = 1;
  138. // calculate the probability of a word following other words.
  139. FOR(wi, bi->words_len) {
  140. bi->words[wi].follows = calloc(1, sizeof(*bi->words[wi].follows) * nwords);
  141. }
  142. sentence_info* s0 = bi->sentence_list; // this sentence
  143. sentence_info* s1 = bi->sentence_list; // prior sentence
  144. sentence_info* s2 = bi->sentence_list; // two sentences ago
  145. FOR(si, bi->sentence_list_len) {
  146. FOR(wi, s0->num_words) {
  147. word_stats* word = &bi->words[s0->word_list[wi]];
  148. if(s2 != s1) {
  149. FOR(w2i, s2->num_words) {
  150. int dist = wi + s0->start_word - s2->start_word - w2i;
  151. word->follows[s2->word_list[w2i]] += (dist_factor / dist);
  152. }
  153. }
  154. if(s1 != s0) {
  155. FOR(w1i, s1->num_words) {
  156. int dist = wi + s0->start_word - s1->start_word - w1i;
  157. word->follows[s1->word_list[w1i]] += (dist_factor / dist);
  158. }
  159. }
  160. FOR(w0i, wi - 1) {
  161. int dist = wi /*+ s0->start_word - s0->start_word */ - w0i;
  162. word->follows[s0->word_list[w0i]] += (dist_factor / dist);
  163. }
  164. }
  165. s2 = s1;
  166. s1 = s0;
  167. s0++;
  168. }
  169. // normlize the probabilities
  170. FOR(wi, nwords) {
  171. float total = 0;
  172. FOR(i, nwords) total += bi->words[wi].follows[i];
  173. total = 1.0 / total;
  174. FOR(i, nwords) bi->words[wi].follows[i] *= total;
  175. }
  176. // collect a list of unique words in each sentence
  177. // this list is used as an index later
  178. int most_uniques = 0;
  179. sentence = bi->sentence_list;
  180. FOR(si, bi->sentence_list_len) {
  181. FOR(wi, sentence->num_words) {
  182. if(!word_exists_in_sentence_prior_to(sentence, wi)) sentence->num_unique_words++;
  183. }
  184. sentence->unique_word_list = calloc(1, sizeof(*sentence->unique_word_list) * sentence->num_unique_words);
  185. int n = 0;
  186. FOR(wi, sentence->num_words) {
  187. if(!word_exists_in_sentence_prior_to(sentence, wi)) {
  188. sentence->unique_word_list[n++] = sentence->word_list[wi];
  189. }
  190. }
  191. most_uniques = MAX(most_uniques, sentence->num_unique_words);
  192. sort_words(sentence->unique_word_list, sentence->num_unique_words);
  193. /*
  194. FOR(a, sentence->num_unique_words) {
  195. printf("%d:%s ", sentence->unique_word_list[a], bi->words[sentence->unique_word_list[a]].text);
  196. }
  197. printf("\n\n");
  198. */
  199. sentence++;
  200. }
  201. sentence = bi->sentence_list;
  202. FOR(si, bi->sentence_list_len) {
  203. FOR(wi, sentence->num_words - 1) {
  204. if(!word_exists_in_sentence_prior_to(sentence, wi)) sentence->num_unique_words++;
  205. }
  206. sentence++;
  207. }
  208. printf("\n");
  209. printf("unique word count: %ld\n", bi->word_lookup.base.fill);
  210. printf("total word count: %ld\n", bi->word_list_len);
  211. printf("total sentence count: %ld\n", bi->sentence_list_len);
  212. printf("total paragraph count: %ld\n", bi->paragraph_list_len);
  213. printf("total chapter count: %ld\n", bi->chapter_list_len);
  214. int sstarters = 0;
  215. HT_EACH(&bi->word_lookup, k, word_stats*, st) {
  216. if(st->starts_sentence > 0) sstarters++;
  217. }
  218. printf("sentence start words: %d\n", sstarters);
  219. printf("most unique words in a sentence: %d\n", most_uniques);
  220. }