/* RSS/Atom feed parser: parses feed XML and prints one TAB-separated line
 * per feed item / entry. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include "util.h"
#include "xml.h"

/* true when in content data and not in the content tag itself */
#define ISINCONTENT(ctx)  ((ctx).iscontent && !((ctx).iscontenttag))
/* true when in the content tag and not (yet) in content data */
#define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)
/* these feed fields support multiple separated values */
#define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory)
/* string and byte-length: expands to two comma-separated values, use only
 * with string literals or char arrays (it relies on sizeof) */
#define STRP(s) s,sizeof(s)-1

/* Kind of feed item currently being parsed; None means outside any item. */
enum FeedType {
	FeedTypeNone = 0,
	FeedTypeRSS  = 1,
	FeedTypeAtom = 2
};

/* Content-type of the item content; also the index into contenttypes[]. */
enum ContentType {
	ContentTypeNone  = 0,
	ContentTypePlain = 1,
	ContentTypeHTML  = 2
};
/* printed name for each ContentType value, in enum order */
static const char *contenttypes[] = { "", "plain", "html" };

/* String data / memory pool */
typedef struct string {
	char   *data;   /* data */
	size_t  len;    /* string length */
	size_t  bufsiz; /* allocated size */
} String;

/* NOTE: within a group of tags that map to the same field (content, date,
 * author) the order of the values indicates the priority to use them,
 * from lowest to highest: a later value overrides an earlier one. */
enum TagId {
	TagUnknown = 0,
	/* RSS */
	RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */
	RSSTagTitle,
	RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded,
	RSSTagGuid,
	RSSTagGuidPermalinkFalse,
	RSSTagGuidPermalinkTrue,
	/* must be defined after GUID, because it can be a link (isPermaLink) */
	RSSTagLink,
	RSSTagEnclosure,
	RSSTagAuthor, RSSTagDccreator,
	RSSTagCategory,
	/* Atom */
	/* creation date has higher priority */
	AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished,
	AtomTagTitle,
	AtomTagMediaDescription, AtomTagSummary, AtomTagContent,
	AtomTagId,
	AtomTagLink,
	AtomTagLinkAlternate,
	AtomTagLinkEnclosure,
	AtomTagAuthor, AtomTagAuthorName,
	AtomTagCategory,
	TagLast
};
- typedef struct feedtag {
- char *name; /* name of tag to match */
- size_t len; /* len of `name` */
- enum TagId id; /* unique ID */
- } FeedTag;
- typedef struct field {
- String str;
- enum TagId tagid; /* tagid set previously, used for tag priority */
- } FeedField;

/* Indices into FeedContext.fields[]; FeedFieldLast is the field count. */
enum {
	FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent,
	FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory,
	FeedFieldLast
};
- typedef struct feedcontext {
- String *field; /* current FeedItem field String */
- FeedField fields[FeedFieldLast]; /* data for current item */
- FeedTag tag; /* unique current parsed tag */
- int iscontent; /* in content data */
- int iscontenttag; /* in content tag */
- enum ContentType contenttype; /* content-type for item */
- enum FeedType feedtype;
- int attrcount; /* count item HTML element attributes */
- } FeedContext;
- static long long datetounix(long long, int, int, int, int, int);
- static FeedTag * gettag(enum FeedType, const char *, size_t);
- static long gettzoffset(const char *);
- static int isattr(const char *, size_t, const char *, size_t);
- static int istag(const char *, size_t, const char *, size_t);
- static int parsetime(const char *, long long *);
- static void printfields(void);
- static void string_append(String *, const char *, size_t);
- static void string_buffer_realloc(String *, size_t);
- static void string_clear(String *);
- static void string_print_encoded(String *);
- static void string_print_timestamp(String *);
- static void string_print_trimmed(String *);
- static void string_print_trimmed_multi(String *);
- static void string_print_uri(String *);
- static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
- const char *, size_t);
- static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
- size_t, const char *, size_t);
- static void xmlattrend(XMLParser *, const char *, size_t, const char *,
- size_t);
- static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
- size_t);
- static void xmldata(XMLParser *, const char *, size_t);
- static void xmldataentity(XMLParser *, const char *, size_t);
- static void xmltagend(XMLParser *, const char *, size_t, int);
- static void xmltagstart(XMLParser *, const char *, size_t);
- static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
- /* map tag name to TagId type */
- /* RSS, must be alphabetical order */
- static const FeedTag rsstags[] = {
- { STRP("author"), RSSTagAuthor },
- { STRP("category"), RSSTagCategory },
- { STRP("content:encoded"), RSSTagContentEncoded },
- { STRP("dc:creator"), RSSTagDccreator },
- { STRP("dc:date"), RSSTagDcdate },
- { STRP("description"), RSSTagDescription },
- /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */
- { STRP("enclosure"), RSSTagEnclosure },
- { STRP("guid"), RSSTagGuid },
- { STRP("link"), RSSTagLink },
- { STRP("media:description"), RSSTagMediaDescription },
- { STRP("pubdate"), RSSTagPubdate },
- { STRP("title"), RSSTagTitle }
- };
- /* Atom, must be alphabetical order */
- static const FeedTag atomtags[] = {
- { STRP("author"), AtomTagAuthor },
- { STRP("category"), AtomTagCategory },
- { STRP("content"), AtomTagContent },
- { STRP("id"), AtomTagId },
- { STRP("issued"), AtomTagIssued }, /* Atom 0.3 */
- /* Atom: <link href="" />, RSS has <link></link> */
- { STRP("link"), AtomTagLink },
- { STRP("media:description"), AtomTagMediaDescription },
- { STRP("modified"), AtomTagModified }, /* Atom 0.3 */
- { STRP("published"), AtomTagPublished },
- { STRP("summary"), AtomTagSummary },
- { STRP("title"), AtomTagTitle },
- { STRP("updated"), AtomTagUpdated }
- };
- /* special case: nested <author><name> */
- static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
- static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };
- /* reference to no / unknown tag */
- static const FeedTag notag = { STRP(""), TagUnknown };
- /* map TagId type to RSS/Atom field, all tags must be defined */
- static const int fieldmap[TagLast] = {
- [TagUnknown] = -1,
- /* RSS */
- [RSSTagDcdate] = FeedFieldTime,
- [RSSTagPubdate] = FeedFieldTime,
- [RSSTagTitle] = FeedFieldTitle,
- [RSSTagMediaDescription] = FeedFieldContent,
- [RSSTagDescription] = FeedFieldContent,
- [RSSTagContentEncoded] = FeedFieldContent,
- [RSSTagGuid] = -1,
- [RSSTagGuidPermalinkFalse] = FeedFieldId,
- [RSSTagGuidPermalinkTrue] = FeedFieldId, /* special-case: both a link and an id */
- [RSSTagLink] = FeedFieldLink,
- [RSSTagEnclosure] = FeedFieldEnclosure,
- [RSSTagAuthor] = FeedFieldAuthor,
- [RSSTagDccreator] = FeedFieldAuthor,
- [RSSTagCategory] = FeedFieldCategory,
- /* Atom */
- [AtomTagModified] = FeedFieldTime,
- [AtomTagUpdated] = FeedFieldTime,
- [AtomTagIssued] = FeedFieldTime,
- [AtomTagPublished] = FeedFieldTime,
- [AtomTagTitle] = FeedFieldTitle,
- [AtomTagMediaDescription] = FeedFieldContent,
- [AtomTagSummary] = FeedFieldContent,
- [AtomTagContent] = FeedFieldContent,
- [AtomTagId] = FeedFieldId,
- [AtomTagLink] = -1,
- [AtomTagLinkAlternate] = FeedFieldLink,
- [AtomTagLinkEnclosure] = FeedFieldEnclosure,
- [AtomTagAuthor] = -1,
- [AtomTagAuthorName] = FeedFieldAuthor,
- [AtomTagCategory] = FeedFieldCategory
- };

/* separator between the fields of one output line */
static const int FieldSeparator = '\t';
/* separator for multiple values in a field, separator should be 1 byte */
static const char FieldMultiSeparator[] = "|";
- static struct uri baseuri;
- static const char *baseurl;
- static FeedContext ctx;
- static XMLParser parser; /* XML parser state */
- static String attrispermalink, attrrel, attrtype, tmpstr;
- static int
- tagcmp(const void *v1, const void *v2)
- {
- return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name);
- }
- /* Unique tagid for parsed tag name. */
- static FeedTag *
- gettag(enum FeedType feedtype, const char *name, size_t namelen)
- {
- FeedTag f, *r = NULL;
- f.name = (char *)name;
- switch (feedtype) {
- case FeedTypeRSS:
- r = bsearch(&f, rsstags, sizeof(rsstags) / sizeof(rsstags[0]),
- sizeof(rsstags[0]), tagcmp);
- break;
- case FeedTypeAtom:
- r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]),
- sizeof(atomtags[0]), tagcmp);
- break;
- default:
- break;
- }
- return r;
- }

/* Return a pointer to the first non-whitespace byte of `s` (this may be
 * the terminating NUL byte). */
static char *
ltrim(const char *s)
{
	for (; ISSPACE((unsigned char)*s); s++)
		;
	return (char *)s;
}

/* Return a pointer just past the last non-whitespace byte of `s`: the
 * position where the trailing whitespace (or the NUL byte) starts.
 * The string itself is not modified. */
static char *
rtrim(const char *s)
{
	const char *e;

	for (e = s + strlen(s); e > s && ISSPACE((unsigned char)*(e - 1)); e--)
		;
	return (char *)e;
}
- /* Clear string only; don't free, prevents unnecessary reallocation. */
- static void
- string_clear(String *s)
- {
- if (s->data)
- s->data[0] = '\0';
- s->len = 0;
- }
- static void
- string_buffer_realloc(String *s, size_t newlen)
- {
- size_t alloclen;
- if (newlen > SIZE_MAX / 2) {
- alloclen = SIZE_MAX;
- } else {
- for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
- ;
- }
- if (!(s->data = realloc(s->data, alloclen)))
- err(1, "realloc");
- s->bufsiz = alloclen;
- }
- /* Append data to String, s->data and data may not overlap. */
- static void
- string_append(String *s, const char *data, size_t len)
- {
- if (!len)
- return;
- if (s->len >= SIZE_MAX - len) {
- errno = EOVERFLOW;
- err(1, "realloc");
- }
- /* check if allocation is necessary, never shrink the buffer. */
- if (s->len + len >= s->bufsiz)
- string_buffer_realloc(s, s->len + len + 1);
- memcpy(s->data + s->len, data, len);
- s->len += len;
- s->data[s->len] = '\0';
- }
- /* Print text, encode TABs, newlines and '\', remove other whitespace.
- * Remove leading and trailing whitespace. */
- static void
- string_print_encoded(String *s)
- {
- const char *p, *e;
- if (!s->data || !s->len)
- return;
- p = ltrim(s->data);
- e = rtrim(p);
- for (; *p && p != e; p++) {
- switch (*p) {
- case '\n': putchar('\\'); putchar('n'); break;
- case '\\': putchar('\\'); putchar('\\'); break;
- case '\t': putchar('\\'); putchar('t'); break;
- default:
- /* ignore control chars */
- if (!ISCNTRL((unsigned char)*p))
- putchar(*p);
- break;
- }
- }
- }

/* Print the NUL-terminated string `s` without leading and trailing
 * whitespace. Whitespace inside the text is printed as a single ' ' per
 * byte, other control chars are not printed. */
static void
printtrimmed(const char *s)
{
	char *p, *e;

	p = ltrim(s);
	e = rtrim(p);
	for (; *p && p != e; p++) {
		if (ISSPACE((unsigned char)*p))
			putchar(' '); /* any whitespace to space */
		else if (!ISCNTRL((unsigned char)*p))
			/* ignore other control chars */
			putchar(*p);
	}
}
- /* Print text, replace TABs, carriage return and other whitespace with ' '.
- * Other control chars are removed. Remove leading and trailing whitespace. */
- static void
- string_print_trimmed(String *s)
- {
- if (!s->data || !s->len)
- return;
- printtrimmed(s->data);
- }
- /* Print each field with trimmed whitespace, separated by '|'. */
- static void
- string_print_trimmed_multi(String *s)
- {
- char *p, *e;
- int c;
- if (!s->data || !s->len)
- return;
- for (p = s->data; ; p = e + 1) {
- if ((e = strstr(p, FieldMultiSeparator))) {
- c = *e;
- *e = '\0';
- printtrimmed(p);
- *e = c; /* restore NUL byte to original character */
- fputs(FieldMultiSeparator, stdout);
- } else {
- printtrimmed(p);
- break;
- }
- }
- }
- /* Print URL, if it's a relative URL then it uses the global `baseurl`. */
- static void
- printuri(char *s)
- {
- char link[4096], *p, *e;
- struct uri newuri, olduri;
- int c, r = -1;
- p = ltrim(s);
- e = rtrim(p);
- c = *e;
- *e = '\0';
- if (baseurl && !uri_hasscheme(p) &&
- uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
- uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
- r = uri_format(link, sizeof(link), &newuri);
- if (r >= 0 && (size_t)r < sizeof(link))
- printtrimmed(link);
- else
- printtrimmed(p);
- *e = c; /* restore NUL byte to original character */
- }
- /* Print URL, if it's a relative URL then it uses the global `baseurl`. */
- static void
- string_print_uri(String *s)
- {
- if (!s->data || !s->len)
- return;
- printuri(s->data);
- }
- /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
- static void
- string_print_timestamp(String *s)
- {
- long long t;
- if (!s->data || !s->len)
- return;
- if (parsetime(s->data, &t) != -1)
- printf("%lld", t);
- }

/* Convert time fields. Returns a UNIX timestamp.
 * `year` is the number of years since 1900, `mon` is the zero-based month
 * (0-11, used as an index into a 12-entry table), `day` is the day of the
 * month starting at 1. The fields are not range-checked here. */
static long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	/* seconds from the start of a non-leap year to the start of each month */
	static const int secs_through_month[] = {
		0, 31 * 86400, 59 * 86400, 90 * 86400,
		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
	long long t;

	if (year - 2ULL <= 136) {
		/* years 1902 up to and including 2038: every 4th year is a
		   leap year, count them relative to 1968 */
		leaps = (year - 68) >> 2;
		if (!((year - 68) & 3)) {
			leaps--;
			is_leap = 1;
		} else {
			is_leap = 0;
		}
		t = 31536000 * (year - 70) + 86400 * leaps;
	} else {
		/* general case: count relative to the year 2000 in cycles of
		   400 years, then centuries, then 4-year groups */
		cycles = (year - 100) / 400;
		rem = (year - 100) % 400;
		if (rem < 0) {
			cycles--;
			rem += 400;
		}
		if (!rem) {
			is_leap = 1;
		} else {
			if (rem >= 300)
				centuries = 3, rem -= 300;
			else if (rem >= 200)
				centuries = 2, rem -= 200;
			else if (rem >= 100)
				centuries = 1, rem -= 100;
			if (rem) {
				leaps = rem / 4U;
				rem %= 4U;
				is_leap = !rem;
			}
		}
		leaps += 97 * cycles + 24 * centuries - is_leap;
		t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
	}

	t += secs_through_month[mon];
	/* the leap day counts for March and later */
	if (is_leap && mon >= 2)
		t += 86400;
	t += 86400LL * (day - 1);
	t += 3600LL * hour;
	t += 60LL * min;
	t += sec;

	return t;
}

/* Get timezone from string, return time offset in seconds from UTC.
 * Accepts a numeric offset ("+HHMM", "-HH:MM") or one of the 3-letter
 * names in the table below (matched case-sensitively). Anything else,
 * including an empty string, gives 0.
 * NOTE: only parses timezones in RFC-822, many other timezone names are
 *       ambiguous anyway.
 * ANSI and military zones are defined wrong in RFC822 and are unsupported,
 * see note on RFC2822 4.3 page 32. */
static long
gettzoffset(const char *s)
{
	static const struct {
		char *name;
		int offsecs; /* offset from UTC in seconds */
	} tzones[] = {
		{ "CDT", -5 * 3600 },
		{ "CST", -6 * 3600 },
		{ "EDT", -4 * 3600 },
		{ "EST", -5 * 3600 },
		{ "MDT", -6 * 3600 },
		{ "MST", -7 * 3600 },
		{ "PDT", -7 * 3600 },
		{ "PST", -8 * 3600 },
	};
	const char *p;
	long tzhour = 0, tzmin = 0;
	size_t i;

	for (; ISSPACE((unsigned char)*s); s++)
		;
	switch (*s) {
	case '-': /* offset */
	case '+':
		/* at most 2 digits for the hours, optional ':', 2 for minutes */
		for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
			tzhour = (tzhour * 10) + (*p - '0');
		if (*p == ':')
			p++;
		for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
			tzmin = (tzmin * 10) + (*p - '0');
		return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
	default: /* timezone name */
		for (i = 0; ISALPHA((unsigned char)s[i]); i++)
			;
		if (i != 3)
			return 0;
		/* compare timezone and adjust offset relative to UTC */
		for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
			if (!memcmp(s, tzones[i].name, 3))
				return tzones[i].offsecs;
		}
	}
	return 0;
}
- /* Parse time string `s` into the UNIX timestamp `tp`.
- Returns 0 on success or -1 on failure. */
- static int
- parsetime(const char *s, long long *tp)
- {
- static const struct {
- char *name;
- int len;
- } mons[] = {
- { STRP("January"), },
- { STRP("February"), },
- { STRP("March"), },
- { STRP("April"), },
- { STRP("May"), },
- { STRP("June"), },
- { STRP("July"), },
- { STRP("August"), },
- { STRP("September"), },
- { STRP("October"), },
- { STRP("November"), },
- { STRP("December"), },
- };
- int va[6] = { 0 }, i, j, v, vi;
- size_t m;
- for (; ISSPACE((unsigned char)*s); s++)
- ;
- if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
- return -1;
- if (ISDIGIT((unsigned char)s[0]) &&
- ISDIGIT((unsigned char)s[1]) &&
- ISDIGIT((unsigned char)s[2]) &&
- ISDIGIT((unsigned char)s[3])) {
- /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
- vi = 0;
- } else {
- /* format: "[%a, ]%d %b %Y %H:%M:%S" */
- /* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
- for (; ISALPHA((unsigned char)*s); s++)
- ;
- for (; ISSPACE((unsigned char)*s); s++)
- ;
- if (*s == ',')
- s++;
- for (; ISSPACE((unsigned char)*s); s++)
- ;
- for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++)
- v = (v * 10) + (*s - '0');
- va[2] = v; /* day */
- for (; ISSPACE((unsigned char)*s); s++)
- ;
- /* end of word month */
- for (j = 0; ISALPHA((unsigned char)s[j]); j++)
- ;
- /* check month name */
- if (j < 3 || j > 9)
- return -1; /* month cannot match */
- for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
- /* abbreviation (3 length) or long name */
- if ((j == 3 || j == mons[m].len) &&
- !strncasecmp(mons[m].name, s, j)) {
- va[1] = m + 1;
- s += j;
- break;
- }
- }
- if (m >= 12)
- return -1; /* no month found */
- for (; ISSPACE((unsigned char)*s); s++)
- ;
- for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++)
- v = (v * 10) + (*s - '0');
- /* obsolete short year: RFC2822 4.3 */
- if (i <= 3)
- v += (v >= 0 && v <= 49) ? 2000 : 1900;
- va[0] = v; /* year */
- for (; ISSPACE((unsigned char)*s); s++)
- ;
- /* parse only regular time part, see below */
- vi = 3;
- }
- /* parse time parts (and possibly remaining date parts) */
- for (; *s && vi < 6; vi++) {
- for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
- ISDIGIT((unsigned char)*s); s++, i++) {
- v = (v * 10) + (*s - '0');
- }
- va[vi] = v;
- if ((vi < 2 && *s == '-') ||
- (vi == 2 && (*s == 'T' || ISSPACE((unsigned char)*s))) ||
- (vi > 2 && *s == ':'))
- s++;
- }
- /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
- if (*s == '.') {
- for (s++; ISDIGIT((unsigned char)*s); s++)
- ;
- }
- /* invalid range */
- if (va[0] < 0 || va[0] > 9999 ||
- va[1] < 1 || va[1] > 12 ||
- va[2] < 1 || va[2] > 31 ||
- va[3] < 0 || va[3] > 23 ||
- va[4] < 0 || va[4] > 59 ||
- va[5] < 0 || va[5] > 60) /* allow leap second */
- return -1;
- *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
- gettzoffset(s);
- return 0;
- }
- static void
- printfields(void)
- {
- string_print_timestamp(&ctx.fields[FeedFieldTime].str);
- putchar(FieldSeparator);
- string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
- putchar(FieldSeparator);
- string_print_uri(&ctx.fields[FeedFieldLink].str);
- putchar(FieldSeparator);
- string_print_encoded(&ctx.fields[FeedFieldContent].str);
- putchar(FieldSeparator);
- fputs(contenttypes[ctx.contenttype], stdout);
- putchar(FieldSeparator);
- string_print_trimmed(&ctx.fields[FeedFieldId].str);
- putchar(FieldSeparator);
- string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
- putchar(FieldSeparator);
- string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
- putchar(FieldSeparator);
- string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
- putchar('\n');
- if (ferror(stdout)) /* check for errors but do not flush */
- checkfileerror(stdout, "<stdout>", 'w');
- }

/* Return non-zero if the tag names are equal, ignoring case.
 * Both names must be NUL-terminated; the lengths are compared first. */
static int
istag(const char *name, size_t len, const char *name2, size_t len2)
{
	return (len == len2 && !strcasecmp(name, name2));
}

/* Return non-zero if the attribute names (or values) are equal, ignoring
 * case. Both must be NUL-terminated; the lengths are compared first. */
static int
isattr(const char *name, size_t len, const char *name2, size_t len2)
{
	return (len == len2 && !strcasecmp(name, name2));
}
- static void
- xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
- const char *v, size_t vl)
- {
- /* handles transforming inline XML to data */
- if (ISINCONTENT(ctx)) {
- if (ctx.contenttype == ContentTypeHTML)
- xmldata(p, v, vl);
- return;
- }
- if (!ctx.tag.id)
- return;
- /* content-type may be: Atom: text, xhtml, html or mime-type.
- MRSS (media:description): plain, html. */
- if (ISCONTENTTAG(ctx)) {
- if (isattr(n, nl, STRP("type")))
- string_append(&attrtype, v, vl);
- return;
- }
- if (ctx.feedtype == FeedTypeRSS) {
- if (ctx.tag.id == RSSTagEnclosure &&
- isattr(n, nl, STRP("url"))) {
- string_append(&tmpstr, v, vl);
- } else if (ctx.tag.id == RSSTagGuid &&
- isattr(n, nl, STRP("ispermalink"))) {
- string_append(&attrispermalink, v, vl);
- }
- } else if (ctx.feedtype == FeedTypeAtom) {
- if (ctx.tag.id == AtomTagLink) {
- if (isattr(n, nl, STRP("rel"))) {
- string_append(&attrrel, v, vl);
- } else if (isattr(n, nl, STRP("href"))) {
- string_append(&tmpstr, v, vl);
- }
- } else if (ctx.tag.id == AtomTagCategory &&
- isattr(n, nl, STRP("term"))) {
- string_append(&tmpstr, v, vl);
- }
- }
- }
- static void
- xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
- const char *data, size_t datalen)
- {
- char buf[16];
- int len;
- /* handles transforming inline XML to data */
- if (ISINCONTENT(ctx)) {
- if (ctx.contenttype == ContentTypeHTML)
- xmldata(p, data, datalen);
- return;
- }
- if (!ctx.tag.id)
- return;
- /* try to translate entity, else just pass as data to
- * xmlattr handler. */
- if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
- xmlattr(p, t, tl, n, nl, buf, (size_t)len);
- else
- xmlattr(p, t, tl, n, nl, data, datalen);
- }
- static void
- xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
- {
- if (ISINCONTENT(ctx)) {
- if (ctx.contenttype == ContentTypeHTML) {
- /* handles transforming inline XML to data */
- xmldata(p, "\"", 1);
- ctx.attrcount = 0;
- }
- return;
- }
- }
- static void
- xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
- {
- if (ISINCONTENT(ctx)) {
- if (ctx.contenttype == ContentTypeHTML) {
- /* handles transforming inline XML to data */
- if (!ctx.attrcount)
- xmldata(p, " ", 1);
- ctx.attrcount++;
- xmldata(p, n, nl);
- xmldata(p, "=\"", 2);
- }
- return;
- }
- if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
- string_clear(&attrispermalink);
- else if (attrrel.len && isattr(n, nl, STRP("rel")))
- string_clear(&attrrel);
- else if (attrtype.len && isattr(n, nl, STRP("type")))
- string_clear(&attrtype);
- else if (tmpstr.len &&
- (isattr(n, nl, STRP("href")) ||
- isattr(n, nl, STRP("term")) ||
- isattr(n, nl, STRP("url"))))
- string_clear(&tmpstr); /* use the last value for multiple attribute values */
- }
- static void
- xmldata(XMLParser *p, const char *s, size_t len)
- {
- if (!ctx.field)
- return;
- if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
- string_append(&tmpstr, s, len);
- else
- string_append(ctx.field, s, len);
- }
- static void
- xmldataentity(XMLParser *p, const char *data, size_t datalen)
- {
- char buf[16];
- int len;
- if (!ctx.field)
- return;
- /* try to translate entity, else just pass as data to
- * xmldata handler. */
- if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
- xmldata(p, buf, (size_t)len);
- else
- xmldata(p, data, datalen);
- }
- static void
- xmltagstart(XMLParser *p, const char *t, size_t tl)
- {
- const FeedTag *f;
- if (ISINCONTENT(ctx)) {
- if (ctx.contenttype == ContentTypeHTML) {
- ctx.attrcount = 0;
- xmldata(p, "<", 1);
- xmldata(p, t, tl);
- }
- return;
- }
- /* start of RSS or Atom item / entry */
- if (ctx.feedtype == FeedTypeNone) {
- if (istag(t, tl, STRP("entry")))
- ctx.feedtype = FeedTypeAtom;
- else if (istag(t, tl, STRP("item")))
- ctx.feedtype = FeedTypeRSS;
- return;
- }
- /* field tagid already set or nested tags. */
- if (ctx.tag.id) {
- /* nested <author><name> for Atom */
- if (ctx.tag.id == AtomTagAuthor &&
- istag(t, tl, STRP("name"))) {
- memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag));
- } else {
- return; /* other nested tags are not allowed: return */
- }
- }
- /* in item */
- if (ctx.tag.id == TagUnknown) {
- if (!(f = gettag(ctx.feedtype, t, tl)))
- f = ¬ag;
- memcpy(&(ctx.tag), f, sizeof(ctx.tag));
- }
- ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
- string_clear(&attrispermalink);
- string_clear(&attrrel);
- string_clear(&attrtype);
- }
- static void
- xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
- {
- enum TagId tagid;
- if (ISINCONTENT(ctx)) {
- if (ctx.contenttype == ContentTypeHTML) {
- if (isshort)
- xmldata(p, "/>", 2);
- else
- xmldata(p, ">", 1);
- }
- return;
- }
- /* set tag type based on it's attribute value */
- if (ctx.tag.id == RSSTagGuid) {
- /* if empty the default is "true" */
- if (!attrispermalink.len ||
- isattr(attrispermalink.data, attrispermalink.len, STRP("true")))
- ctx.tag.id = RSSTagGuidPermalinkTrue;
- else
- ctx.tag.id = RSSTagGuidPermalinkFalse;
- } else if (ctx.tag.id == AtomTagLink) {
- /* empty or "alternate": other types could be
- "enclosure", "related", "self" or "via" */
- if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
- ctx.tag.id = AtomTagLinkAlternate;
- else if (isattr(attrrel.data, attrrel.len, STRP("enclosure")))
- ctx.tag.id = AtomTagLinkEnclosure;
- else
- ctx.tag.id = AtomTagLink; /* unknown */
- }
- tagid = ctx.tag.id;
- /* map tag type to field: unknown or lesser priority is ignored,
- when tags of the same type are repeated only the first is used. */
- if (fieldmap[tagid] == -1 ||
- (!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
- tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
- return;
- }
- if (ctx.iscontenttag) {
- ctx.iscontent = 1;
- ctx.iscontenttag = 0;
- /* detect content-type based on type attribute */
- if (attrtype.len) {
- if (isattr(attrtype.data, attrtype.len, STRP("html")) ||
- isattr(attrtype.data, attrtype.len, STRP("xhtml")) ||
- isattr(attrtype.data, attrtype.len, STRP("text/html")) ||
- isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) ||
- isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml")))
- ctx.contenttype = ContentTypeHTML;
- else /* unknown: handle as base64 text data */
- ctx.contenttype = ContentTypePlain;
- } else {
- /* default content-type */
- if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription)
- ctx.contenttype = ContentTypeHTML;
- else
- ctx.contenttype = ContentTypePlain;
- }
- }
- ctx.field = &(ctx.fields[fieldmap[tagid]].str);
- ctx.fields[fieldmap[tagid]].tagid = tagid;
- /* clear field if it is overwritten (with a priority order) for the new
- value, if the field can have multiple values then do not clear it. */
- if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
- string_clear(ctx.field);
- }
- static void
- xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
- {
- size_t i;
- if (ctx.feedtype == FeedTypeNone)
- return;
- if (ISINCONTENT(ctx)) {
- /* not a closed content field */
- if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
- if (!isshort && ctx.contenttype == ContentTypeHTML) {
- xmldata(p, "</", 2);
- xmldata(p, t, tl);
- xmldata(p, ">", 1);
- }
- return;
- }
- } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
- /* matched tag end: close it */
- /* copy also to the link field if the attribute isPermaLink="true"
- and it is not set by a tag with higher priority. */
- if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
- ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
- string_clear(&ctx.fields[FeedFieldLink].str);
- string_append(&ctx.fields[FeedFieldLink].str,
- ctx.field->data, ctx.field->len);
- ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
- }
- } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
- istag(t, tl, STRP("entry"))) || /* Atom */
- (ctx.feedtype == FeedTypeRSS &&
- istag(t, tl, STRP("item"))))) /* RSS */
- {
- /* end of RSS or Atom entry / item */
- printfields();
- /* clear strings */
- for (i = 0; i < FeedFieldLast; i++) {
- string_clear(&ctx.fields[i].str);
- ctx.fields[i].tagid = TagUnknown;
- }
- ctx.contenttype = ContentTypeNone;
- /* allow parsing of Atom and RSS concatenated in one XML stream. */
- ctx.feedtype = FeedTypeNone;
- } else {
- return; /* not end of field */
- }
- /* temporary string: for fields that cannot be processed
- directly and need more context, for example by it's tag
- attributes, like the Atom link rel="alternate|enclosure". */
- if (tmpstr.len && ctx.field) {
- if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
- if (ctx.field->len)
- string_append(ctx.field, FieldMultiSeparator, 1);
- string_append(ctx.field, tmpstr.data, tmpstr.len);
- } else {
- string_clear(ctx.field);
- string_append(ctx.field, tmpstr.data, tmpstr.len);
- }
- }
- /* close field */
- string_clear(&tmpstr); /* reuse and clear temporary string */
- if (ctx.tag.id == AtomTagAuthorName)
- memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
- else
- memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag));
- ctx.iscontent = 0;
- ctx.field = NULL;
- }
- int
- main(int argc, char *argv[])
- {
- if (pledge("stdio", NULL) == -1)
- err(1, "pledge");
- if (argc > 1) {
- if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
- baseurl = argv[1];
- else
- errx(1, "baseurl incorrect or too long");
- }
- memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag));
- parser.xmlattr = xmlattr;
- parser.xmlattrentity = xmlattrentity;
- parser.xmlattrend = xmlattrend;
- parser.xmlattrstart = xmlattrstart;
- parser.xmlcdata = xmldata;
- parser.xmldata = xmldata;
- parser.xmldataentity = xmldataentity;
- parser.xmltagend = xmltagend;
- parser.xmltagstart = xmltagstart;
- parser.xmltagstartparsed = xmltagstartparsed;
- /* NOTE: getnext is defined in xml.h for inline optimization */
- xml_parse(&parser);
- checkfileerror(stdin, "<stdin>", 'r');
- checkfileerror(stdout, "<stdout>", 'w');
- return 0;
- }
|