parse.h 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. #include <stdio.h>
  2. #include <string.h>
  3. /* #include <sys/stat.h> /\* getting file size *\/ */
  4. #include <stdlib.h> /* calloc */
  5. #include <stdbool.h>
  6. #include <regex.h>
  7. #include <assert.h>
  8. #include "types.h"
  9. #define MAX_LENGTH_ELEMENT_NAME 15
  10. void *
  11. xmalloc (size_t size)
  12. {
  13. void *value = malloc (size);
  14. assert(value != NULL);
  15. /* if (value == 0) */
  16. /* fatal ("virtual memory exhausted"); */
  17. return value;
  18. }
  19. elementptr
  20. allocate_new_element ()
  21. {
  22. //FIXME why do I need to say struct element here?
  23. elementptr element = (elementptr) xmalloc (sizeof (struct element));
  24. element->done_parsing = false;
  25. /* initialize the element's pointer's to NULL */
  26. element->child = element->younger_sibling =
  27. element->older_sibling = NULL;
  28. element->contents = NULL;
  29. element->attribute = NULL;
  30. return element;
  31. }
  32. /*
  33. * This function returns a newly registered element. The returned element has
  34. * the struct members defined. ie: element->name = <some string>
  35. */
  36. elementptr
  37. register_new_element (elementptr element, char * token,
  38. bool is_one_liner_element)
  39. {
  40. elementptr temp_element = allocate_new_element();
  41. //if the current element doesn't have any children,
  42. //then let's store this element as it's first child
  43. if (element->child == NULL)
  44. {
  45. element->child = temp_element;
  46. element->child->parent = element;
  47. //the next line is not functional.
  48. element = element->child;
  49. element->name = token;
  50. }
  51. else //find the last child element, and store this as a sibling element
  52. {
  53. elementptr temp_traverse_element = element->child;
  54. while (temp_traverse_element->younger_sibling != NULL)
  55. {
  56. temp_traverse_element = temp_traverse_element->younger_sibling;
  57. }
  58. temp_traverse_element->younger_sibling = temp_element;
  59. temp_traverse_element = temp_element;
  60. temp_traverse_element->parent = element;
  61. element = temp_traverse_element;
  62. element->name = token;
  63. }
  64. // if this is a <br/> element, then we're done parsing
  65. // this element
  66. if (is_one_liner_element)
  67. {
  68. element->done_parsing = true;
  69. element = element->parent;
  70. }
  71. return element;
  72. }
  73. //FIXME make parsing set older_sibling
  74. elementptr parse_html (char * file_name) {
  75. /* const char string[] = "<html><title>Hello World!</title><body><p class=\"radical\">This is a cool paragraph</p></body></html>"; */
  76. FILE * html_file;
  77. if (html_file == NULL) /* make this check to see if the array has a string... */
  78. {
  79. html_file = fopen(file_name, "r");
  80. }
  81. const char delimiters[] = "<>";
  82. char *token, *cp;
  83. fseek (html_file, 0, SEEK_END);
  84. long length = ftell (html_file); //get current file position
  85. fseek (html_file, 0, SEEK_SET);
  86. char * buffer = (char *) malloc (length);
  87. fread (buffer, 1, length, html_file);
  88. token = strtok (buffer, delimiters); /* token => "words" */
  89. //printf("%s\n", token);
  90. elementptr first_element = allocate_new_element ();
  91. first_element->name = token;
  92. elementptr element = first_element;
  93. //the regexp should look like: "[a-zA-Z0-9!@#$%^&*(){}\[/?=+\|_;] +$
  94. // the above regexp suffers form an error. Namely that a line ending
  95. //in "] \n" won't match. unfortunately the simplier version
  96. //"[^ ] +$" matches an empty line that has whitespace.
  97. // [[info:libc#Matching%20POSIX%20Regexps][info:libc#Matching POSIX Regexps]]
  98. //compile the regular expression
  99. regmatch_t matchptr [12];
  100. regex_t regex_opening_element;
  101. regex_t regex_closing_element; //alphanumeric with a trailing slash
  102. regex_t regex_one_line_element;
  103. //since we are tokenizing "<" and ">", some of the tokens will be the
  104. //whitespace plus newlines between <div>s. So I need a way to ignore
  105. //those whitespace tokens
  106. regex_t regex_whitespace_and_newline;
  107. //check that regcomp compiles successfully
  108. // add REG_NO_SUB
  109. int regcompile_flags = REG_EXTENDED|REG_NOSUB;
  110. //this is not a complete regex. There are some elements that look like
  111. // <div class="hello" id="main"></div>
  112. //This regex, won't match the open as an opening element, because
  113. //it contains spaces...
  114. int error = regcomp (&regex_opening_element,
  115. "^[a-zA-Z0-9]+ +[a-zA-Z0-9]=[a-zA-Z0-9]$",
  116. regcompile_flags);
  117. if (error != 0)
  118. {
  119. printf("You got an error!\n");
  120. char string[50];
  121. regerror (error, &regex_opening_element, string,
  122. sizeof(char) * 50);
  123. return 0;
  124. }
  125. error = regcomp (&regex_closing_element, "^/[a-zA-Z0-9]+$",
  126. regcompile_flags);
  127. if (error != 0)
  128. {
  129. printf("You got another error!\n");
  130. return 0;
  131. }
  132. error = regcomp (&regex_one_line_element, "^[a-zA-Z0-9]+/$",
  133. regcompile_flags);
  134. if (error != 0)
  135. {
  136. printf("You got 1 more error!\n");
  137. return 0;
  138. }
  139. error = regcomp (&regex_whitespace_and_newline, "^[ \t\n]+$",
  140. regcompile_flags);
  141. if (error != 0)
  142. {
  143. printf("You got a whitespace error!\n");
  144. return 0;
  145. }
  146. //parse the buffer into tokens delimited by "<" and ">"
  147. //evaluate each token and store each token into the linked list:
  148. //element note that this code is not functional.
  149. // register_new_element will change the location of element.
  150. while ((token = strtok (NULL, delimiters)) != NULL) {
  151. /* if this is just whitespace, then ignore it */
  152. if (regexec (&regex_whitespace_and_newline, token, 0, 0, 0) == 0)
  153. {
  154. printf("whitespace is: %s\n", token);
  155. continue;
  156. }
  157. /* if this is an opening html element, then store it as a new
  158. element */
  159. if (regexec (&regex_opening_element, token, 0, 0, 0) == 0)
  160. {
  161. //the next line is NOT functional
  162. element = register_new_element (element, token, false);
  163. printf ("%s is an opening element\n");
  164. }
  165. else if (regexec (&regex_closing_element, token, 0, 0, 0) == 0)
  166. {
  167. //ok this is a closing element, let's close the current element
  168. element->done_parsing = true;
  169. //and make element by the parent element
  170. element = element->parent;
  171. printf("%s is a closing element \n", token);
  172. }
  173. else if (regexec (&regex_one_line_element, token, 0, 0, 0) == 0)
  174. {
  175. element = register_new_element (element, token, true);
  176. printf("%s is a one line element \n", token);
  177. }
  178. else {
  179. printf("%s is an element's content\n", token);
  180. element->contents = token;
  181. }
  182. }
  183. return first_element;
  184. }
  185. /*
  186. ** Let's print all the elements!
  187. */
  188. void print_elements (elementptr element) {
  189. /* if current element has child && younger_sibling element,
  190. ** then print them. */
  191. printf("<%s>", element->name);
  192. if (element->contents)
  193. printf("%s", element->contents);
  194. if (element->child && element->younger_sibling)
  195. {
  196. print_elements (element->child);
  197. printf("</%s>", element->name);
  198. print_elements (element->younger_sibling);
  199. }
  200. else if (element->child)
  201. {
  202. /* if the element only has a child element, then
  203. ** print the child. */
  204. print_elements (element->child);
  205. printf("</%s>", element->name);
  206. /* if the element only has a younger sibling,
  207. ** then print it. */
  208. }
  209. else if (element->younger_sibling)
  210. {
  211. printf("</%s>", element->name);
  212. print_elements (element->younger_sibling);
  213. }
  214. /* if we have reached the last element, then print it and return true. */
  215. if (element->younger_sibling == NULL &&
  216. element->child == NULL)
  217. { // I can probably remove all of this code block
  218. printf("</%s>", element->name);
  219. return;
  220. }
  221. }