|
@@ -1,6 +1,262 @@
|
|
|
#include "parser.h"
|
|
|
|
|
|
-void parse(FILE *stream)
|
|
|
+#include <wctype.h>
|
|
|
+#include <wchar.h>
|
|
|
+#include "utils/assert.h"
|
|
|
+#include "utils/logger.h"
|
|
|
+
|
|
|
+typedef enum
|
|
|
+{
|
|
|
+ #define ESTATE_ENUM(x) EState_##x,
|
|
|
+ #include "EState.inc"
|
|
|
+ #undef ESTATE_ENUM
|
|
|
+
|
|
|
+} EState;
|
|
|
+
|
|
|
+const char *EStateToStr(EState s)
|
|
|
+{
|
|
|
+ switch (s)
|
|
|
+ {
|
|
|
+ #define ESTATE_ENUM(x) case EState_##x: return #x;
|
|
|
+ #include "EState.inc"
|
|
|
+ #undef ESTATE_ENUM
|
|
|
+ }
|
|
|
+ return "INVALID VALUE OF EState";
|
|
|
+}
|
|
|
+
|
|
|
+void changeStateValue(EState *toChange, const EState newValue)
|
|
|
+{
|
|
|
+ if (newValue == *toChange)
|
|
|
+ {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ LOG_DEBUG("Changing EState value from %s to %s", EStateToStr(*toChange), EStateToStr(newValue));
|
|
|
+ *toChange = newValue;
|
|
|
+}
|
|
|
+
|
|
|
+EState parseBeforeTag(wint_t c);
|
|
|
+EState parseTagStart(wint_t c);
|
|
|
+EState parseTagName(wint_t c);
|
|
|
+EState parseTagWhiteSpace(wint_t c);
|
|
|
+EState parseTagAttributeName(wint_t c);
|
|
|
+EState parseText(wint_t c);
|
|
|
+EState parseClosingNoEndTag(wint_t c);
|
|
|
+
|
|
|
+HtmlDocument* parse(FILE *stream)
|
|
|
+{
|
|
|
+ ASSERT_MSG(stream != NULL, "NULL pointer provided");
|
|
|
+
|
|
|
+ EState state = EState_BeforeTag;
|
|
|
+
|
|
|
+ wint_t buffer[1024*50];
|
|
|
+ size_t bufferPosition = 0;
|
|
|
+
|
|
|
+ wint_t c;
|
|
|
+ while ((c = fgetwc(stream)) != WEOF)
|
|
|
+ {
|
|
|
+ // LOG_DEBUG("%lc", c);
|
|
|
+ switch (state)
|
|
|
+ {
|
|
|
+ case EState_BeforeTag:
|
|
|
+ {
|
|
|
+ EState newState = parseBeforeTag(c);
|
|
|
+ if (EState_ParsingText == newState)
|
|
|
+ {
|
|
|
+ ASSERT(bufferPosition == 0);
|
|
|
+ buffer[bufferPosition++] = c;
|
|
|
+ }
|
|
|
+ changeStateValue(&state, newState);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ case EState_ParsingTagStart:
|
|
|
+ {
|
|
|
+ EState newState = parseTagStart(c);
|
|
|
+ if (EState_Error == newState)
|
|
|
+ {
|
|
|
+ LOG_ERROR("Error during parsing");
|
|
|
+ return NULL;
|
|
|
+ }
|
|
|
+ if (EState_ParsingTagName == newState)
|
|
|
+ {
|
|
|
+ ASSERT(bufferPosition == 0);
|
|
|
+ buffer[bufferPosition++] = c;
|
|
|
+ }
|
|
|
+ changeStateValue(&state, newState);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ case EState_ParsingTagName:
|
|
|
+ {
|
|
|
+ EState newState = parseTagName(c);
|
|
|
+ if (newState == EState_ParsingTagName)
|
|
|
+ {
|
|
|
+ buffer[bufferPosition++] = c;
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ buffer[bufferPosition++] = '\0';
|
|
|
+ LOG_INFO("TAG NAME: '%ls'", buffer);
|
|
|
+ bufferPosition = 0;
|
|
|
+ changeStateValue(&state, newState);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ case EState_ParsingTagWhiteSpace:
|
|
|
+ {
|
|
|
+ EState newState = parseTagWhiteSpace(c);
|
|
|
+ changeStateValue(&state, newState);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ case EState_ParsingTagAttributeName:
|
|
|
+ {
|
|
|
+ EState newState = parseTagAttributeName(c);
|
|
|
+ changeStateValue(&state, newState);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ case EState_ParsingText:
|
|
|
+ {
|
|
|
+ EState newState = parseText(c);
|
|
|
+ if (newState == EState_ParsingText)
|
|
|
+ {
|
|
|
+ buffer[bufferPosition++] = c;
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ buffer[bufferPosition++] = '\0';
|
|
|
+ LOG_INFO("TEXT: '%ls'", buffer);
|
|
|
+ bufferPosition = 0;
|
|
|
+ changeStateValue(&state, newState);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ case EState_ClosingNoEndTag:
|
|
|
+ {
|
|
|
+ EState newState = parseClosingNoEndTag(c);
|
|
|
+ if (EState_Error == newState)
|
|
|
+ {
|
|
|
+ LOG_ERROR("Invalid syntax for closing no-end <tag/>");
|
|
|
+ return EState_Error;
|
|
|
+ }
|
|
|
+ changeStateValue(&state, newState);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ default:
|
|
|
+ {
|
|
|
+ LOG_ERROR("Unhandled state: %s", EStateToStr(state));
|
|
|
+ return EState_Error;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return NULL;
|
|
|
+}
|
|
|
+
|
|
|
+EState parseBeforeTag(wint_t c)
|
|
|
{
|
|
|
+ if ('<' != c)
|
|
|
+ {
|
|
|
+ return EState_ParsingText;
|
|
|
+ }
|
|
|
+ return EState_ParsingTagStart;
|
|
|
+}
|
|
|
+
|
|
|
+EState parseText(wint_t c)
|
|
|
+{
|
|
|
+ if ('<' != c)
|
|
|
+ {
|
|
|
+ return EState_ParsingText;
|
|
|
+ }
|
|
|
+ return EState_ParsingTagStart;
|
|
|
+}
|
|
|
|
|
|
+EState parseTagStart(wint_t c)
|
|
|
+{
|
|
|
+ if ('/' == c)
|
|
|
+ {
|
|
|
+ return EState_ParsingClosingTagStart;
|
|
|
+ }
|
|
|
+ if (!iswalnum(c) && '!' != c)
|
|
|
+ {
|
|
|
+ LOG_ERROR("Invalid char in tag name: %lc", c);
|
|
|
+ return EState_Error;
|
|
|
+ }
|
|
|
+ return EState_ParsingTagName;
|
|
|
+}
|
|
|
+
|
|
|
+EState parseTagName(wint_t c)
|
|
|
+{
|
|
|
+ if (iswalnum(c))
|
|
|
+ {
|
|
|
+ return EState_ParsingTagName;
|
|
|
+ }
|
|
|
+ if ('/' == c)
|
|
|
+ {
|
|
|
+ return EState_ClosingNoEndTag;
|
|
|
+ }
|
|
|
+ if ('>' == c)
|
|
|
+ {
|
|
|
+ return EState_BeforeTag;
|
|
|
+ }
|
|
|
+ if (' ' == c)
|
|
|
+ {
|
|
|
+ return EState_ParsingTagWhiteSpace;
|
|
|
+ }
|
|
|
+ return EState_ParsingTagName;
|
|
|
}
|
|
|
+
|
|
|
+EState parseTagWhiteSpace(wint_t c)
|
|
|
+{
|
|
|
+ if (' ' == c)
|
|
|
+ {
|
|
|
+ return EState_ParsingTagWhiteSpace;
|
|
|
+ }
|
|
|
+ if ('/' == c)
|
|
|
+ {
|
|
|
+ return EState_ClosingNoEndTag;
|
|
|
+ }
|
|
|
+ if ('>' == c)
|
|
|
+ {
|
|
|
+ return EState_BeforeTag;
|
|
|
+ }
|
|
|
+ if (iswalnum(c))
|
|
|
+ {
|
|
|
+ return EState_ParsingTagAttributeName;
|
|
|
+ }
|
|
|
+ LOG_ERROR("Invalid char '%lc'", c);
|
|
|
+ return EState_Error;
|
|
|
+}
|
|
|
+
|
|
|
+EState parseTagAttributeName(wint_t c)
|
|
|
+{
|
|
|
+ if (' ' == c)
|
|
|
+ {
|
|
|
+ return EState_ParsingTagWhiteSpace;
|
|
|
+ }
|
|
|
+ if ('/' == c)
|
|
|
+ {
|
|
|
+ return EState_ClosingNoEndTag;
|
|
|
+ }
|
|
|
+ if ('>' == c)
|
|
|
+ {
|
|
|
+ return EState_BeforeTag;
|
|
|
+ }
|
|
|
+ if ('=' == c)
|
|
|
+ {
|
|
|
+ return EState_ParsingTagAttributeAssignment;
|
|
|
+ }
|
|
|
+ if (iswalnum(c))
|
|
|
+ {
|
|
|
+ return EState_ParsingTagAttributeName;
|
|
|
+ }
|
|
|
+ LOG_ERROR("Invalid char '%lc'", c);
|
|
|
+ return EState_Error;
|
|
|
+}
|
|
|
+
|
|
|
+EState parseClosingNoEndTag(wint_t c)
|
|
|
+{
|
|
|
+ if ('>' == c)
|
|
|
+ {
|
|
|
+ return EState_BeforeTag;
|
|
|
+ }
|
|
|
+ LOG_ERROR("Invalid char '%lc'", c);
|
|
|
+ return EState_Error;
|
|
|
+}
|