/* jbranso/programming */
							#include <stdio.h>
#include <string.h>
/* #include <sys/stat.h>           /\* getting file size *\/ */
#include <stdlib.h>             /* calloc */
#include <stdbool.h>             
#include <regex.h>
#include <assert.h>
#include "types.h"

#define MAX_LENGTH_ELEMENT_NAME 15

/*
 * Allocate SIZE bytes with malloc, aborting the program if the
 * allocation fails.  The check is an explicit test rather than an
 * assert so that it is still performed when the file is compiled with
 * NDEBUG.
 *
 * A request for zero bytes is rounded up to one byte, because malloc (0)
 * is allowed to return NULL even when memory is available.
 *
 * Returns a non-NULL pointer that the caller releases with free ().
 */
void *
xmalloc (size_t size)
{
  void *value = malloc (size != 0 ? size : 1);
  if (value == NULL)
    {
      fprintf (stderr, "virtual memory exhausted\n");
      abort ();
    }
  return value;
}

elementptr 
allocate_new_element ()
{
  //FIXME why do I need to say struct element here?
  elementptr element = (elementptr) xmalloc (sizeof (struct element));
  element->done_parsing = false;
  /* initialize the element's pointer's to NULL */
  element->child = element->younger_sibling =
    element->older_sibling  = NULL;
  element->contents = NULL;
  element->attribute = NULL;
  return element;
}

/*
 * Create a new element named TOKEN and append it as the last child of
 * ELEMENT.
 *
 * The new element's parent is set to ELEMENT.  If ELEMENT already has
 * children, the new element becomes the younger_sibling of the current
 * last child, and that child is recorded as the new element's
 * older_sibling.
 *
 * TOKEN is stored by pointer, not copied, so it must stay valid for as
 * long as the element is in use.
 *
 * Returns the element that parsing should continue in: the new element
 * itself, or ELEMENT when IS_ONE_LINER_ELEMENT is true (such an element,
 * e.g. <br/>, is marked done_parsing immediately and can have no
 * children of its own).
 */
elementptr
register_new_element (elementptr element, char * token,
                      bool is_one_liner_element)
{
  elementptr new_element = allocate_new_element ();
  new_element->name = token;
  new_element->parent = element;

  if (element->child == NULL)
    {
      /* First child of this parent. */
      element->child = new_element;
    }
  else
    {
      /* Walk to the last existing child and hang the new one after it. */
      elementptr last_child = element->child;
      while (last_child->younger_sibling != NULL)
        {
          last_child = last_child->younger_sibling;
        }
      last_child->younger_sibling = new_element;
      new_element->older_sibling = last_child;
    }

  if (is_one_liner_element)
    {
      /* Nothing more can be parsed inside it, so stay in the parent. */
      new_element->done_parsing = true;
      return element;
    }
  return new_element;
}

/* Read all of FILE_NAME into a newly malloc'd, NUL-terminated buffer.
   Returns NULL, after printing a message to stderr, if anything fails. */
static char *
read_html_file (const char *file_name)
{
  if (file_name == NULL)
    {
      fprintf (stderr, "parse_html: no file name given\n");
      return NULL;
    }

  FILE *html_file = fopen (file_name, "r");
  if (html_file == NULL)
    {
      perror (file_name);
      return NULL;
    }

  /* Find the file size by seeking to the end and back. */
  char *buffer = NULL;
  long length = -1;
  if (fseek (html_file, 0, SEEK_END) == 0)
    length = ftell (html_file);
  if (length >= 0 && fseek (html_file, 0, SEEK_SET) == 0)
    buffer = malloc ((size_t) length + 1);   /* +1 for the terminator */

  if (buffer != NULL)
    {
      size_t bytes_read = fread (buffer, 1, (size_t) length, html_file);
      if (ferror (html_file))
        {
          free (buffer);
          buffer = NULL;
        }
      else
        {
          /* strtok needs a terminated string; fread does not add one. */
          buffer[bytes_read] = '\0';
        }
    }

  if (buffer == NULL)
    fprintf (stderr, "%s: could not read file\n", file_name);

  fclose (html_file);
  return buffer;
}

/* Compile PATTERN into REGEX with FLAGS.  On failure print
   FAILURE_MESSAGE to stdout and the regerror text to stderr, and
   return false. */
static bool
compile_html_regex (regex_t *regex, const char *pattern, int flags,
                    const char *failure_message)
{
  int error = regcomp (regex, pattern, flags);
  if (error == 0)
    return true;

  char description[128];
  regerror (error, regex, description, sizeof description);
  printf ("%s", failure_message);
  fprintf (stderr, "regcomp \"%s\": %s\n", pattern, description);
  return false;
}

/*
 * Parse the file FILE_NAME into a tree of elements and return its root.
 *
 * The file is read into one buffer, which is split in place on '<' and
 * '>' with strtok.  The first token becomes the name of the root
 * element.  Every later token is classified with a regular expression:
 *   - only spaces, tabs and newlines: ignored;
 *   - opening element: registered as a new child, parsing descends
 *     into it;
 *   - closing element ("/name"): the current element is marked
 *     done_parsing and parsing moves back up to its parent (it stays
 *     put when there is no parent);
 *   - one-line element ("name/"): registered as a finished child;
 *   - anything else: stored as the current element's contents.
 *
 * Element names and contents point into the buffer, so the buffer is
 * deliberately NOT freed on success; it must outlive the tree.
 *
 * Returns NULL if the file cannot be read, contains no tokens, or a
 * regular expression fails to compile.
 *
 * NOTE(review): the opening-element pattern only matches a name
 * followed by a single one-character attribute (e.g. "p a=b"); a plain
 * "body" or a quoted attribute falls through to "contents".  The
 * patterns are left unchanged here because splitting on "<>" loses the
 * tag/text distinction, so loosening them would misclassify text.
 * NOTE(review): this function never sets older_sibling itself; any
 * sibling linking is whatever register_new_element does.
 */
elementptr parse_html (char * file_name) {
  enum { RE_OPENING, RE_CLOSING, RE_ONE_LINE, RE_WHITESPACE, RE_COUNT };
  static const struct
  {
    const char *pattern;
    const char *failure_message;
  } regex_specs[RE_COUNT] = {
    [RE_OPENING]    = { "^[a-zA-Z0-9]+ +[a-zA-Z0-9]=[a-zA-Z0-9]$",
                        "You got an error!\n" },
    /* alphanumeric with a leading slash */
    [RE_CLOSING]    = { "^/[a-zA-Z0-9]+$", "You got another error!\n" },
    [RE_ONE_LINE]   = { "^[a-zA-Z0-9]+/$", "You got 1 more error!\n" },
    /* tokenizing on "<" and ">" also yields the whitespace and
       newlines that sit between tags; this pattern spots those */
    [RE_WHITESPACE] = { "^[ \t\n]+$", "You got a whitespace error!\n" },
  };
  const int regcompile_flags = REG_EXTENDED | REG_NOSUB;
  const char delimiters[] = "<>";

  char *buffer = read_html_file (file_name);
  if (buffer == NULL)
    return NULL;

  regex_t regexes[RE_COUNT];
  size_t compiled = 0;
  while (compiled < RE_COUNT
         && compile_html_regex (&regexes[compiled],
                                regex_specs[compiled].pattern,
                                regcompile_flags,
                                regex_specs[compiled].failure_message))
    {
      compiled++;
    }

  elementptr first_element = NULL;
  char *token = NULL;
  if (compiled == RE_COUNT)
    token = strtok (buffer, delimiters);

  if (token != NULL)
    {
      first_element = allocate_new_element ();
      first_element->name = token;
      /* The root has no parent; say so explicitly so the closing-tag
         branch below can test for it. */
      first_element->parent = NULL;

      elementptr element = first_element;
      while ((token = strtok (NULL, delimiters)) != NULL)
        {
          /* if this is just whitespace, then ignore it */
          if (regexec (&regexes[RE_WHITESPACE], token, 0, NULL, 0) == 0)
            {
              printf("whitespace is: %s\n", token);
              continue;
            }

          if (regexec (&regexes[RE_OPENING], token, 0, NULL, 0) == 0)
            {
              /* descend into the newly registered child */
              element = register_new_element (element, token, false);
              printf ("%s is an opening element\n", token);
            }
          else if (regexec (&regexes[RE_CLOSING], token, 0, NULL, 0) == 0)
            {
              /* close the current element and climb back to its
                 parent, unless we are already at the root */
              element->done_parsing = true;
              if (element->parent != NULL)
                element = element->parent;
              printf("%s is a closing element \n", token);
            }
          else if (regexec (&regexes[RE_ONE_LINE], token, 0, NULL, 0) == 0)
            {
              element = register_new_element (element, token, true);
              printf("%s is a one line element \n", token);
            }
          else
            {
              printf("%s is an element's content\n", token);
              element->contents = token;
            }
        }
    }

  /* The compiled patterns are only needed while tokenizing. */
  for (size_t i = 0; i < compiled; i++)
    regfree (&regexes[i]);

  /* With no tree, nothing points into the buffer, so release it. */
  if (first_element == NULL)
    free (buffer);

  return first_element;
}

/*
 * Print ELEMENT, its descendants and all of its younger siblings to
 * stdout as markup.
 *
 * For each element this prints "<name>", then its contents if it has
 * any, then its children, then "</name>".  Younger siblings are handled
 * by the loop and only children by recursion, so a long run of siblings
 * does not deepen the call stack.
 *
 * A NULL ELEMENT prints nothing, and an element whose name is NULL is
 * printed with an empty name.
 */
void print_elements (elementptr element) {
  for (; element != NULL; element = element->younger_sibling)
    {
      const char *name = element->name != NULL ? element->name : "";

      printf("<%s>", name);
      if (element->contents)
        printf("%s", element->contents);

      /* Returns immediately when there is no child. */
      print_elements (element->child);

      printf("</%s>", name);
    }
}