/* HTML parser: tokenizes a file on '<' and '>' and builds a tree of elements linked by parent, child and sibling pointers. */
- #include <stdio.h>
- #include <string.h>
- /* #include <sys/stat.h> /\* getting file size *\/ */
- #include <stdlib.h> /* calloc */
- #include <stdbool.h>
- #include <regex.h>
- #include <assert.h>
- #include "types.h"
- #define MAX_LENGTH_ELEMENT_NAME 15
/*
 * Allocate SIZE bytes, aborting the program if the allocation fails.
 *
 * The result is never NULL, so callers do not need to check it.  A request
 * for zero bytes is rounded up to one byte, because malloc (0) is allowed to
 * return NULL even when memory is available.  The failure check is an
 * explicit test rather than an assert so that it is still performed when
 * this file is compiled with NDEBUG.
 */
void *
xmalloc (size_t size)
{
  void *value = malloc (size != 0 ? size : 1);

  if (value == NULL)
    {
      fputs ("virtual memory exhausted\n", stderr);
      abort ();
    }
  return value;
}
- elementptr
- allocate_new_element ()
- {
- //FIXME why do I need to say struct element here?
- elementptr element = (elementptr) xmalloc (sizeof (struct element));
- element->done_parsing = false;
- /* initialize the element's pointer's to NULL */
- element->child = element->younger_sibling =
- element->older_sibling = NULL;
- element->contents = NULL;
- element->attribute = NULL;
- return element;
- }
- /*
- * This function returns a newly registered element. The returned element has
- * the struct members defined. ie: element->name = <some string>
- */
- elementptr
- register_new_element (elementptr element, char * token,
- bool is_one_liner_element)
- {
- elementptr temp_element = allocate_new_element();
- //if the current element doesn't have any children,
- //then let's store this element as it's first child
- if (element->child == NULL)
- {
- element->child = temp_element;
- element->child->parent = element;
- //the next line is not functional.
- element = element->child;
- element->name = token;
- }
- else //find the last child element, and store this as a sibling element
- {
- elementptr temp_traverse_element = element->child;
- while (temp_traverse_element->younger_sibling != NULL)
- {
- temp_traverse_element = temp_traverse_element->younger_sibling;
- }
- temp_traverse_element->younger_sibling = temp_element;
- temp_traverse_element = temp_element;
- temp_traverse_element->parent = element;
- element = temp_traverse_element;
- element->name = token;
- }
- // if this is a <br/> element, then we're done parsing
- // this element
- if (is_one_liner_element)
- {
- element->done_parsing = true;
- element = element->parent;
- }
- return element;
- }
- //FIXME make parsing set older_sibling
- elementptr parse_html (char * file_name) {
- /* const char string[] = "<html><title>Hello World!</title><body><p class=\"radical\">This is a cool paragraph</p></body></html>"; */
- FILE * html_file;
- if (html_file == NULL) /* make this check to see if the array has a string... */
- {
- html_file = fopen(file_name, "r");
- }
-
- const char delimiters[] = "<>";
- char *token, *cp;
- fseek (html_file, 0, SEEK_END);
- long length = ftell (html_file); //get current file position
- fseek (html_file, 0, SEEK_SET);
- char * buffer = (char *) malloc (length);
- fread (buffer, 1, length, html_file);
- token = strtok (buffer, delimiters); /* token => "words" */
- //printf("%s\n", token);
- elementptr first_element = allocate_new_element ();
- first_element->name = token;
- elementptr element = first_element;
- //the regexp should look like: "[a-zA-Z0-9!@#$%^&*(){}\[/?=+\|_;] +$
- // the above regexp suffers form an error. Namely that a line ending
- //in "] \n" won't match. unfortunately the simplier version
- //"[^ ] +$" matches an empty line that has whitespace.
- // [[info:libc#Matching%20POSIX%20Regexps][info:libc#Matching POSIX Regexps]]
- //compile the regular expression
- regmatch_t matchptr [12];
- regex_t regex_opening_element;
- regex_t regex_closing_element; //alphanumeric with a trailing slash
- regex_t regex_one_line_element;
- //since we are tokenizing "<" and ">", some of the tokens will be the
- //whitespace plus newlines between <div>s. So I need a way to ignore
- //those whitespace tokens
- regex_t regex_whitespace_and_newline;
- //check that regcomp compiles successfully
- // add REG_NO_SUB
- int regcompile_flags = REG_EXTENDED|REG_NOSUB;
- //this is not a complete regex. There are some elements that look like
- // <div class="hello" id="main"></div>
- //This regex, won't match the open as an opening element, because
- //it contains spaces...
- int error = regcomp (®ex_opening_element,
- "^[a-zA-Z0-9]+ +[a-zA-Z0-9]=[a-zA-Z0-9]$",
- regcompile_flags);
- if (error != 0)
- {
- printf("You got an error!\n");
- char string[50];
- regerror (error, ®ex_opening_element, string,
- sizeof(char) * 50);
- return 0;
- }
- error = regcomp (®ex_closing_element, "^/[a-zA-Z0-9]+$",
- regcompile_flags);
- if (error != 0)
- {
- printf("You got another error!\n");
- return 0;
- }
- error = regcomp (®ex_one_line_element, "^[a-zA-Z0-9]+/$",
- regcompile_flags);
- if (error != 0)
- {
- printf("You got 1 more error!\n");
- return 0;
- }
- error = regcomp (®ex_whitespace_and_newline, "^[ \t\n]+$",
- regcompile_flags);
- if (error != 0)
- {
- printf("You got a whitespace error!\n");
- return 0;
- }
- //parse the buffer into tokens delimited by "<" and ">"
- //evaluate each token and store each token into the linked list:
- //element note that this code is not functional.
- // register_new_element will change the location of element.
- while ((token = strtok (NULL, delimiters)) != NULL) {
- /* if this is just whitespace, then ignore it */
- if (regexec (®ex_whitespace_and_newline, token, 0, 0, 0) == 0)
- {
- printf("whitespace is: %s\n", token);
- continue;
- }
- /* if this is an opening html element, then store it as a new
- element */
- if (regexec (®ex_opening_element, token, 0, 0, 0) == 0)
- {
- //the next line is NOT functional
- element = register_new_element (element, token, false);
- printf ("%s is an opening element\n");
- }
- else if (regexec (®ex_closing_element, token, 0, 0, 0) == 0)
- {
- //ok this is a closing element, let's close the current element
- element->done_parsing = true;
- //and make element by the parent element
- element = element->parent;
- printf("%s is a closing element \n", token);
- }
- else if (regexec (®ex_one_line_element, token, 0, 0, 0) == 0)
- {
- element = register_new_element (element, token, true);
- printf("%s is a one line element \n", token);
- }
- else {
- printf("%s is an element's content\n", token);
- element->contents = token;
- }
- }
- return first_element;
- }
- /*
- ** Let's print all the elements!
- */
- void print_elements (elementptr element) {
- /* if current element has child && younger_sibling element,
- ** then print them. */
- printf("<%s>", element->name);
- if (element->contents)
- printf("%s", element->contents);
-
- if (element->child && element->younger_sibling)
- {
- print_elements (element->child);
- printf("</%s>", element->name);
- print_elements (element->younger_sibling);
- }
- else if (element->child)
- {
- /* if the element only has a child element, then
- ** print the child. */
- print_elements (element->child);
- printf("</%s>", element->name);
- /* if the element only has a younger sibling,
- ** then print it. */
- }
- else if (element->younger_sibling)
- {
- printf("</%s>", element->name);
- print_elements (element->younger_sibling);
- }
- /* if we have reached the last element, then print it and return true. */
- if (element->younger_sibling == NULL &&
- element->child == NULL)
- { // I can probably remove all of this code block
- printf("</%s>", element->name);
- return;
- }
- }
|