5 次代码提交 f78ac6f2ed ... 27ba2d3465

作者 SHA1 备注 提交日期
  rafal_rr 27ba2d3465 - partial parser impl 5 年之前
  rafal_rr cb0cb97352 - parser_tests implementation 5 年之前
  rafal_rr 84cf734c01 -bug introduced during conflict resolution corrected 5 年之前
  rafal_rr 30344dffe5 - makefile improvemnet 5 年之前
  rafal_rr 3a36ced877 test parser: draft 5 年之前
共有 8 个文件被更改,包括 349 次插入3 次删除
  1. 5 0
      .gitignore
  2. 18 1
      Makefile
  3. 13 0
      src/parser/EState.inc
  4. 257 1
      src/parser/parser.c
  5. 6 1
      src/parser/parser.h
  6. 9 0
      src/utils/assert.h
  7. 38 0
      test/parser_tests/parser_tests.c
  8. 3 0
      test/parser_tests/run_parsers_tests1.sh

+ 5 - 0
.gitignore

@@ -42,6 +42,11 @@
 
 # Binary output file
 tgnews*
+parser_tests
 
 # Build directory
 build/*
+
+
+# Input files from contests.com. Link it to ./data
+data

+ 18 - 1
Makefile

@@ -1,6 +1,7 @@
 CC = gcc
 
 srcdir = src
+testdir = test
 objdir = build
 target = tgnews
 target_trainer = tgnews_trainer
@@ -14,11 +15,14 @@ includes_lib := -I$(srcdir)
 src := $(wildcard $(srcdir)/*/*.c)
 src_target_trainer := $(srcdir)/main_trainer.c
 src_target := $(srcdir)/main.c
+src_test := $(wildcard $(testdir)/*/*.c)
 obj_target_trainer := $(src_target_trainer:$(srcdir)/%.c=$(objdir)/%.o)
 obj_target := $(src_target:$(srcdir)/%.c=$(objdir)/%.o)
 obj := $(src:$(srcdir)/%.c=$(objdir)/%.o)
+obj_test := $(src_test:$(testdir)/%.c=$(objdir)/test/%.o)
 obj_all := $(obj) $(obj_target) $(obj_target_trainer)
 dep := $(obj:%.o=%.d)
+dep_test := $(obj_test:%.o=%.d)
 
 all: output_directories $(target_lib) $(target) $(target_trainer)
 
@@ -27,6 +31,9 @@ debug: $(eval CFLAGS += -g) all
 output_directories:
 	@mkdir -p $(objdir) $(dir $(obj))
 
+output_directories_test:
+	@mkdir -p $(objdir) $(dir $(obj_test))
+
 $(target_lib): $(obj)
 	ar rcs $@ $^
 
@@ -36,11 +43,21 @@ $(target_trainer): $(obj_target_trainer) $(target_lib)
 $(target): $(obj_target) $(target_lib)
 	$(CC) -o $@ $^ $(LDFLAGS)
 
+# Test executable name(s): each test source with its directory and .c suffix
+# stripped (test/parser_tests/parser_tests.c -> parser_tests). Built-in
+# functions are used instead of $(shell basename ...), which forks a shell and
+# misinterprets its arguments as soon as there is more than one test source.
+target_parser_tests := $(notdir $(basename $(src_test)))
+
+# Link the test objects against the project static library.
+$(target_parser_tests): $(obj_test) $(target_lib)
+	$(CC) -o $@ $^ $(LDFLAGS)
+
+# Build the test executable(s); output directories are created first.
+test: output_directories output_directories_test $(target_parser_tests)
+
 -include $(dep)
 $(obj_all): $(objdir)/%.o : $(srcdir)/%.c
 	$(CC) $(CFLAGS) -o $@ -MMD -c $< $(includes_lib)
+-include $(dep_test)
+$(obj_test): $(objdir)/%.o : %.c
+	$(CC) $(CFLAGS) -o $@ -MMD -c $< $(includes_lib)
 
 .PHONY: clean output_directories debug
 clean:
 	@rm -fr build
-	@rm -f $(target) $(target_trainer)
+	@rm -f $(target) $(target_trainer) $(target_parser_tests)

+ 13 - 0
src/parser/EState.inc

@@ -0,0 +1,13 @@
+ESTATE_ENUM(Error)
+ESTATE_ENUM(BeforeTag)
+ESTATE_ENUM(ParsingTagStart)
+ESTATE_ENUM(ParsingTagWhiteSpace)
+ESTATE_ENUM(ParsingTagAttributeName)
+ESTATE_ENUM(ParsingTagAttributeAssignment)
+ESTATE_ENUM(ParsingTagAttributeOneWordValue)
+ESTATE_ENUM(ParsingTagAttributeLongValue)
+ESTATE_ENUM(ParsingTagName)
+ESTATE_ENUM(ParsingClosingTagStart)
+ESTATE_ENUM(ParsingClosingTagName)
+ESTATE_ENUM(ClosingNoEndTag)
+ESTATE_ENUM(ParsingText)

+ 257 - 1
src/parser/parser.c

@@ -1,6 +1,262 @@
 #include "parser.h"
 
-void parse(FILE *stream)
+#include <wctype.h>
+#include <wchar.h>
+#include "utils/assert.h"
+#include "utils/logger.h"
+
+/* States of the parser state machine. One enumerator EState_<x> is generated
+ * for every ESTATE_ENUM(x) entry listed in EState.inc. */
+typedef enum
+{
+    #define ESTATE_ENUM(x) EState_##x,
+    #include "EState.inc"
+    #undef ESTATE_ENUM
+
+} EState;
+
+/* Returns the name of state s without the EState_ prefix, or a fixed
+ * placeholder string when s is not one of the enumerators. */
+const char *EStateToStr(EState s)
+{
+    /* Generated from the same list as the enum, so index == enumerator value. */
+    static const char *const stateNames[] =
+    {
+        #define ESTATE_ENUM(x) #x,
+        #include "EState.inc"
+        #undef ESTATE_ENUM
+    };
+    const size_t stateCount = sizeof(stateNames) / sizeof(stateNames[0]);
+
+    return ((size_t)s < stateCount) ? stateNames[s] : "INVALID VALUE OF EState";
+}
+
+/* Stores newValue in *toChange and logs the transition at debug level.
+ * Does nothing (and logs nothing) when the state is already newValue. */
+void changeStateValue(EState *toChange, const EState newValue)
+{
+    const EState previous = *toChange;
+    if (previous != newValue)
+    {
+        LOG_DEBUG("Changing EState value from %s to %s", EStateToStr(previous), EStateToStr(newValue));
+        *toChange = newValue;
+    }
+}
+
+EState parseBeforeTag(wint_t c);
+EState parseTagStart(wint_t c);
+EState parseTagName(wint_t c);
+EState parseTagWhiteSpace(wint_t c);
+EState parseTagAttributeName(wint_t c);
+EState parseText(wint_t c);
+EState parseClosingNoEndTag(wint_t c);
+
+/*
+ * Stores c at buffer[*position] and advances *position, always keeping one
+ * slot free for the terminating L'\0'.
+ * Returns 1 on success, or 0 (after logging) when the buffer is full.
+ */
+static int appendChar(wchar_t *buffer, size_t capacity, size_t *position, wint_t c)
+{
+    if (*position + 1 >= capacity)
+    {
+        LOG_ERROR("Parser buffer overflow: token exceeds %zu characters", capacity - 1);
+        return 0;
+    }
+    buffer[(*position)++] = (wchar_t)c;
+    return 1;
+}
+
+/*
+ * Drives the HTML state machine over the wide characters read from stream.
+ * Tag names and text runs are collected in a scratch buffer and logged.
+ *
+ * stream: open input stream; must not be NULL.
+ * Returns NULL on a parse error, an unhandled state or an over-long token.
+ * Building the HtmlDocument is not implemented yet, so NULL is also returned
+ * once the end of the stream is reached.
+ */
+HtmlDocument* parse(FILE *stream)
+{
+    ASSERT_MSG(stream != NULL, "NULL pointer provided");
+
+    EState state = EState_BeforeTag;
+
+    /* wchar_t (not wint_t) because the buffer is printed with "%ls". */
+    enum { BufferCapacity = 1024 * 50 };
+    wchar_t buffer[BufferCapacity];
+    size_t bufferPosition = 0;
+
+    wint_t c;
+    while ((c = fgetwc(stream)) != WEOF)
+    {
+        switch (state)
+        {
+            case EState_BeforeTag:
+            {
+                EState newState = parseBeforeTag(c);
+                if (EState_ParsingText == newState)
+                {
+                    /* c is the first character of a text run. */
+                    ASSERT(bufferPosition == 0);
+                    if (!appendChar(buffer, BufferCapacity, &bufferPosition, c))
+                    {
+                        return NULL;
+                    }
+                }
+                changeStateValue(&state, newState);
+                break;
+            }
+            case EState_ParsingTagStart:
+            {
+                EState newState = parseTagStart(c);
+                if (EState_Error == newState)
+                {
+                    LOG_ERROR("Error during parsing");
+                    return NULL;
+                }
+                if (EState_ParsingTagName == newState)
+                {
+                    /* c is the first character of the tag name. */
+                    ASSERT(bufferPosition == 0);
+                    if (!appendChar(buffer, BufferCapacity, &bufferPosition, c))
+                    {
+                        return NULL;
+                    }
+                }
+                changeStateValue(&state, newState);
+                break;
+            }
+            case EState_ParsingTagName:
+            {
+                EState newState = parseTagName(c);
+                if (newState == EState_ParsingTagName)
+                {
+                    if (!appendChar(buffer, BufferCapacity, &bufferPosition, c))
+                    {
+                        return NULL;
+                    }
+                }
+                else
+                {
+                    /* appendChar always leaves room for the terminator. */
+                    buffer[bufferPosition] = L'\0';
+                    LOG_INFO("TAG NAME: '%ls'", buffer);
+                    bufferPosition = 0;
+                    changeStateValue(&state, newState);
+                }
+                break;
+            }
+            case EState_ParsingTagWhiteSpace:
+            {
+                EState newState = parseTagWhiteSpace(c);
+                if (EState_Error == newState)
+                {
+                    LOG_ERROR("Error during parsing");
+                    return NULL;
+                }
+                changeStateValue(&state, newState);
+                break;
+            }
+            case EState_ParsingTagAttributeName:
+            {
+                EState newState = parseTagAttributeName(c);
+                if (EState_Error == newState)
+                {
+                    LOG_ERROR("Error during parsing");
+                    return NULL;
+                }
+                changeStateValue(&state, newState);
+                break;
+            }
+            case EState_ParsingText:
+            {
+                EState newState = parseText(c);
+                if (newState == EState_ParsingText)
+                {
+                    if (!appendChar(buffer, BufferCapacity, &bufferPosition, c))
+                    {
+                        return NULL;
+                    }
+                }
+                else
+                {
+                    buffer[bufferPosition] = L'\0';
+                    LOG_INFO("TEXT: '%ls'", buffer);
+                    bufferPosition = 0;
+                    changeStateValue(&state, newState);
+                }
+                break;
+            }
+            case EState_ClosingNoEndTag:
+            {
+                EState newState = parseClosingNoEndTag(c);
+                if (EState_Error == newState)
+                {
+                    LOG_ERROR("Invalid syntax for closing no-end <tag/>");
+                    return NULL;
+                }
+                changeStateValue(&state, newState);
+                break;
+            }
+            default:
+            {
+                /* States listed in EState.inc that have no handler yet. */
+                LOG_ERROR("Unhandled state: %s", EStateToStr(state));
+                return NULL;
+            }
+        }
+    }
+
+    return NULL;
+}
+
+/* Transition taken between tags: '<' opens a tag, any other character
+ * starts a text run. */
+EState parseBeforeTag(wint_t c)
 {
+    return ('<' == c) ? EState_ParsingTagStart : EState_ParsingText;
+}
+
+/* Transition taken inside a text run: '<' ends the text and opens a tag,
+ * any other character continues the text. */
+EState parseText(wint_t c)
+{
+    return ('<' == c) ? EState_ParsingTagStart : EState_ParsingText;
+}
 
+/* Transition for the character right after '<': '/' begins a closing tag,
+ * an alphanumeric character or '!' begins a tag name, anything else is
+ * logged and reported as an error. */
+EState parseTagStart(wint_t c)
+{
+    if ('/' == c)
+    {
+        return EState_ParsingClosingTagStart;
+    }
+    if (iswalnum(c) || '!' == c)
+    {
+        return EState_ParsingTagName;
+    }
+    LOG_ERROR("Invalid char in tag name: %lc", c);
+    return EState_Error;
+}
+
+/* Transition taken inside a tag name: '/', '>' and ' ' leave the name,
+ * every other character keeps the parser in the name. */
+EState parseTagName(wint_t c)
+{
+    switch (c)
+    {
+        case '/':
+            return EState_ClosingNoEndTag;
+        case '>':
+            return EState_BeforeTag;
+        case ' ':
+            return EState_ParsingTagWhiteSpace;
+        default:
+            /* Alphanumerics and all remaining characters stay in the name. */
+            return EState_ParsingTagName;
+    }
 }
+
+/* Transition taken on whitespace inside a tag: further spaces are skipped,
+ * '/' and '>' end the tag, an alphanumeric character starts an attribute
+ * name, anything else is logged and reported as an error. */
+EState parseTagWhiteSpace(wint_t c)
+{
+    switch (c)
+    {
+        case ' ':
+            return EState_ParsingTagWhiteSpace;
+        case '/':
+            return EState_ClosingNoEndTag;
+        case '>':
+            return EState_BeforeTag;
+        default:
+            break;
+    }
+    if (iswalnum(c))
+    {
+        return EState_ParsingTagAttributeName;
+    }
+    LOG_ERROR("Invalid char '%lc'", c);
+    return EState_Error;
+}
+
+/* Transition taken inside an attribute name: ' ' returns to tag whitespace,
+ * '/' and '>' end the tag, '=' starts the assignment, an alphanumeric
+ * character continues the name, anything else is logged and reported as an
+ * error. */
+EState parseTagAttributeName(wint_t c)
+{
+    switch (c)
+    {
+        case ' ':
+            return EState_ParsingTagWhiteSpace;
+        case '/':
+            return EState_ClosingNoEndTag;
+        case '>':
+            return EState_BeforeTag;
+        case '=':
+            return EState_ParsingTagAttributeAssignment;
+        default:
+            break;
+    }
+    if (iswalnum(c))
+    {
+        return EState_ParsingTagAttributeName;
+    }
+    LOG_ERROR("Invalid char '%lc'", c);
+    return EState_Error;
+}
+
+/* Transition for the character after the '/' of a self-closing tag: only
+ * '>' is accepted; anything else is logged and reported as an error. */
+EState parseClosingNoEndTag(wint_t c)
+{
+    if ('>' != c)
+    {
+        LOG_ERROR("Invalid char '%lc'", c);
+        return EState_Error;
+    }
+    return EState_BeforeTag;
+}

+ 6 - 1
src/parser/parser.h

@@ -1,8 +1,13 @@
 #ifndef PARSER_H
 #define PARSER_H
 
+#include <stdint.h>
 #include <stdio.h>
 
-void parse(FILE *stream);
+/* Result type of parse(). Currently a placeholder with no members. */
+typedef struct
+{
+} HtmlDocument;
+
+HtmlDocument *parse(FILE *stream);
 
 #endif // PARSER_H

+ 9 - 0
src/utils/assert.h

@@ -0,0 +1,9 @@
+#ifndef ASSERT_H
+#define ASSERT_H
+
+#include <assert.h>
+
+#define ASSERT(cond) assert(cond)
+#define ASSERT_MSG(cond, msg) assert((cond)&&(msg))
+
+#endif  // ASSERT_H

+ 38 - 0
test/parser_tests/parser_tests.c

@@ -0,0 +1,38 @@
+#include <locale.h>
+#include <stdlib.h>
+#include <time.h>
+#include "parser/parser.h"
+#include "utils/logger.h"
+
+/*
+ * Test driver: parses the file named by argv[1] and logs how long parse()
+ * took. Exits with EXIT_FAILURE when no file is given, the file cannot be
+ * opened, or parse() returns NULL; returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+    if (argc < 2)
+    {
+        LOG_ERROR("Provide file, please");
+        exit(EXIT_FAILURE);
+    }
+    const char *filename = argv[1];
+    LOG_INFO("File to parse: '%s'", filename);
+
+    /* Adopt the environment's locale before any wide-character input is
+     * read. A failure is reported but not fatal: the run continues with
+     * the default "C" locale. */
+    if (setlocale(LC_ALL, "") == NULL)
+    {
+        LOG_ERROR("Couldn't set locale from environment");
+    }
+
+    FILE *pFile = fopen(filename, "r");
+    if (!pFile)
+    {
+        LOG_ERROR("Couldn't open '%s'", filename);
+        exit(EXIT_FAILURE);
+    }
+    LOG_INFO("Starting parsing");
+    clock_t clockStart = clock();
+    const HtmlDocument *doc = parse(pFile);
+    clock_t clockEnd = clock();
+    /* clock() ticks -> milliseconds, computed in double to match "%f". */
+    LOG_INFO("Parsing finished. It took %f miliseconds", (double)(clockEnd - clockStart) * 1000.0 / CLOCKS_PER_SEC);
+    fclose(pFile);
+
+    if (!doc)
+    {
+        LOG_ERROR("Error during parsing file '%s'", filename);
+        exit(EXIT_FAILURE);
+    }
+
+    return 0;
+}

+ 3 - 0
test/parser_tests/run_parsers_tests1.sh

@@ -0,0 +1,3 @@
+#!/bin/bash
+
+# Builds the parser test binary and runs it on one sample document.
+# The Makefile, the built ./parser_tests binary and the ./data link all live
+# in the repository root (two levels above this script), so switch there
+# first; this lets the script be started from any directory.
+cd "$(dirname "$0")/../.." || exit 1
+
+make test CFLAGS=-DLOGGER_SEVERITY=0 && ./parser_tests data/DataClusteringSample0107/20191103/21/29040143744473890.html