main.cpp 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. #include <iostream>
  2. #include <fstream>
  3. #include <list>
  4. #include <unordered_map>
  5. #include <string>
  6. #include <ctype.h>
  7. #include <string.h>
  8. #include "pos.hpp"
  9. using namespace std;
  10. typedef list<string*> Sentence;
  11. void checkCaps(Sentence* sent);
  12. bool word_char(uint32_t c) {
  13. if(c == ' ') return false;
  14. return
  15. // isalnum(c) ||
  16. isupper(c) ||
  17. islower(c) ||
  18. c == '-' ||
  19. c == '$' ||
  20. c == '@' ||
  21. c == '#' ||
  22. c == '%' ||
  23. c == '_'
  24. ;
  25. }
  26. bool is_whitespace(uint32_t c) {
  27. return c == ' ' || c == '\t' || c == '\r' || c == '\n';
  28. }
  29. bool ends_word(uint32_t c) {
  30. return !(isalnum(c) || c == '-');
  31. }
  32. bool ends_sentence(uint32_t c) {
  33. return
  34. c == '.' ||
  35. c == '!' ||
  36. c == '?';
  37. }
  38. // punctuation that can occur inside a sentence
  39. bool interior_punct(uint32_t c) {
  40. return
  41. c == ',' ||
  42. c == ';' ||
  43. // c == '-' ||
  44. c == '\'' ||
  45. c == '"' ||
  46. c == '(' ||
  47. c == ')' ||
  48. c == ':';
  49. }
  50. void load_POS(string path, unordered_map<string, POS> dict);
  51. int main(int argc, char* argv[]) {
  52. list<string*>* sentence = new list<string*>;
  53. unordered_map<string, POS> dict;
  54. load_POS("./part-of-speech.txt", dict);
  55. cout << "POS db loaded\n";
  56. char buffer[256];
  57. int i = 0;
  58. bool in_word = false;
  59. while(1) {
  60. char c;
  61. cin.get(c);
  62. if(cin.eof()) {
  63. break;
  64. }
  65. if(word_char(c)) {
  66. buffer[i++] = c;
  67. in_word = true;
  68. // cout << "+ '" << c << "'";
  69. }
  70. else {
  71. // cout << "-" << endl;
  72. if(in_word) {
  73. // cycle the word
  74. string* s = new string(buffer, i);
  75. // cout << "word: '" << *s << "'\n";
  76. sentence->push_back(s);
  77. i = 0;
  78. in_word = false;
  79. }
  80. if(!is_whitespace(c)) {
  81. // push punctuation
  82. sentence->push_back(new string(&c, 1));
  83. }
  84. // else ignore whitespace
  85. if(ends_sentence(c)) {
  86. // cout << "sentence ended\n";
  87. for(const string* s : *sentence) {
  88. cout << *s << " ";
  89. }
  90. cout << endl;
  91. checkCaps(sentence);
  92. for(const string* s : *sentence) delete s;
  93. delete sentence;
  94. sentence = new list<string*>;
  95. }
  96. }
  97. }
  98. }
  99. void checkCaps(Sentence* sent) {
  100. if(sent->size() == 0) return;
  101. string* s = sent->front();
  102. char c = (*s)[0];
  103. cout << c << endl;
  104. if(islower(c)) {
  105. cout << "sentence is not capitalized.\n";
  106. }
  107. }
  108. /*
  109. N Noun
  110. P Plural
  111. h Noun Phrase
  112. V Verb (usu participle)
  113. t Verb (transitive)
  114. i Verb (intransitive)
  115. A Adjective
  116. v Adverb
  117. C Conjunction
  118. p Preposition
  119. ! Interjection
  120. r Pronoun
  121. D Definite Article
  122. I Indefinite Article
  123. o Nominative
  124. */
  125. uint32_t masks = {
  126. #define POS_NOUN_MASK 0b00000000 00000000 00000000 00000001
  127. #define POS_PLURAL_MASK 0b00000000 00000000 00000000 00000010
  128. #define POS_NOUN_PHR_MASK 0b00000000 00000000 00000000 00000100
  129. #define POS_VERB_PART_MASK 0b00000000 00000000 00000000 00001000
  130. #define POS_VERB_TRAN_MASK 0b00000000 00000000 00000000 00010000
  131. #define POS_VERB_INT_MASK 0b00000000 00000000 00000000 00100000
  132. #define POS_ADJECTIVE_MASK 0b00000000 00000000 00000000 01000000
  133. #define POS_ADVERB_MASK 0b00000000 00000000 00000000 10000000
  134. #define POS_CONJUNCTION_MASK 0b00000000 00000000 00000001 00000000
  135. #define POS_PREPOSITION_MASK 0b00000000 00000000 00000010 00000000
  136. #define POS_INTERJECTION_MASK 0b00000000 00000000 00000100 00000000
  137. #define POS_PRONOUN_MASK 0b00000000 00000000 00001000 00000000
  138. #define POS_ART_DEF_MASK 0b00000000 00000000 00010000 00000000
  139. #define POS_ART_IND_MASK 0b00000000 00000000 00100000 00000000
  140. #define POS_NOMINATIVE_MASK 0b00000000 00000000 01000000 00000000
  141. };
  142. void load_POS(string path, unordered_map<string, POS> dict) {
  143. // rad the file
  144. ifstream f(path.c_str(), ifstream::in);
  145. // um... failure?
  146. f.seekg(0, ios::end);
  147. size_t size = f.tellg();
  148. f.seekg(0);
  149. char* source = new char[size+1];
  150. f.read(source, size);
  151. source[size] = 0;
  152. // process the contents
  153. char* s = source;
  154. while(*s) {
  155. char* end = strchr(s, '\t');
  156. if(!end) break;
  157. string word(s, end - s);
  158. // cout << word << endl;
  159. dict.insert({word, POS(end)});
  160. s = strchr(end, '\n') + 1;
  161. }
  162. }