/* sfeed.c */
  1. #include <errno.h>
  2. #include <stdint.h>
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <string.h>
  6. #include <strings.h>
  7. #include "util.h"
  8. #include "xml.h"
  /* In content data and no longer in the content tag itself. */
  #define ISINCONTENT(ctx)    ((ctx).iscontent && !(ctx).iscontenttag)
  /* In the content tag, but not (yet) in its content data. */
  #define ISCONTENTTAG(ctx)   (!(ctx).iscontent && (ctx).iscontenttag)
  /* Feed fields that support multiple separated values. */
  #define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory)
  /* Expands to TWO comma-separated arguments: the string and its byte-length
   * (sizeof minus the terminating NUL), so use it on string literals. */
  #define STRP(s)             s, sizeof(s) - 1
  /* Kind of feed being parsed; FeedTypeNone means not detected (yet). */
  enum FeedType {
      FeedTypeNone = 0,
      FeedTypeRSS  = 1,
      FeedTypeAtom = 2
  };
  /* Content-type of the content field of an item. */
  enum ContentType {
      ContentTypeNone  = 0,
      ContentTypePlain = 1,
      ContentTypeHTML  = 2
  };

  /* Printable name of each ContentType, indexed by the enum value. */
  static const char *contenttypes[] = {
      [ContentTypeNone]  = "",
      [ContentTypePlain] = "plain",
      [ContentTypeHTML]  = "html"
  };
  /* Growable string buffer, also used as a reusable memory pool. */
  typedef struct string {
      char   *data;   /* buffer holding the string data */
      size_t  len;    /* length of the string in bytes */
      size_t  bufsiz; /* number of bytes allocated for `data` */
  } String;
  /* Unique ID per recognized tag.
   * NOTE: the order of the IDs that map to the same field (content, date,
   * author, ...) is their priority, from least important to most important. */
  enum TagId {
      TagUnknown = 0,

      /* RSS */
      RSSTagDcdate,
      RSSTagPubdate,              /* creation date has higher priority */
      RSSTagTitle,
      RSSTagMediaDescription,
      RSSTagDescription,
      RSSTagContentEncoded,
      RSSTagGuid,
      RSSTagGuidPermalinkFalse,
      RSSTagGuidPermalinkTrue,
      /* must be defined after GUID, because it can be a link (isPermaLink) */
      RSSTagLink,
      RSSTagEnclosure,
      RSSTagAuthor,
      RSSTagDccreator,
      RSSTagCategory,

      /* Atom */
      /* creation date has higher priority */
      AtomTagModified,
      AtomTagUpdated,
      AtomTagIssued,
      AtomTagPublished,
      AtomTagTitle,
      AtomTagMediaDescription,
      AtomTagSummary,
      AtomTagContent,
      AtomTagId,
      AtomTagLink,
      AtomTagLinkAlternate,
      AtomTagLinkEnclosure,
      AtomTagAuthor,
      AtomTagAuthorName,
      AtomTagCategory,

      TagLast                     /* number of tag IDs, not a tag itself */
  };
  61. typedef struct feedtag {
  62. char *name; /* name of tag to match */
  63. size_t len; /* len of `name` */
  64. enum TagId id; /* unique ID */
  65. } FeedTag;
  66. typedef struct field {
  67. String str;
  68. enum TagId tagid; /* tagid set previously, used for tag priority */
  69. } FeedField;
  /* Indexes of the per-item fields; FeedFieldLast is the number of fields. */
  enum {
      FeedFieldTime = 0,
      FeedFieldTitle,
      FeedFieldLink,
      FeedFieldContent,
      FeedFieldId,
      FeedFieldAuthor,
      FeedFieldEnclosure,
      FeedFieldCategory,
      FeedFieldLast
  };
  75. typedef struct feedcontext {
  76. String *field; /* current FeedItem field String */
  77. FeedField fields[FeedFieldLast]; /* data for current item */
  78. FeedTag tag; /* unique current parsed tag */
  79. int iscontent; /* in content data */
  80. int iscontenttag; /* in content tag */
  81. enum ContentType contenttype; /* content-type for item */
  82. enum FeedType feedtype;
  83. int attrcount; /* count item HTML element attributes */
  84. } FeedContext;
  85. static long long datetounix(long long, int, int, int, int, int);
  86. static FeedTag * gettag(enum FeedType, const char *, size_t);
  87. static long gettzoffset(const char *);
  88. static int isattr(const char *, size_t, const char *, size_t);
  89. static int istag(const char *, size_t, const char *, size_t);
  90. static int parsetime(const char *, long long *);
  91. static void printfields(void);
  92. static void string_append(String *, const char *, size_t);
  93. static void string_buffer_realloc(String *, size_t);
  94. static void string_clear(String *);
  95. static void string_print_encoded(String *);
  96. static void string_print_timestamp(String *);
  97. static void string_print_trimmed(String *);
  98. static void string_print_trimmed_multi(String *);
  99. static void string_print_uri(String *);
  100. static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
  101. const char *, size_t);
  102. static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
  103. size_t, const char *, size_t);
  104. static void xmlattrend(XMLParser *, const char *, size_t, const char *,
  105. size_t);
  106. static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
  107. size_t);
  108. static void xmldata(XMLParser *, const char *, size_t);
  109. static void xmldataentity(XMLParser *, const char *, size_t);
  110. static void xmltagend(XMLParser *, const char *, size_t, int);
  111. static void xmltagstart(XMLParser *, const char *, size_t);
  112. static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
  113. /* map tag name to TagId type */
  114. /* RSS, must be alphabetical order */
  115. static const FeedTag rsstags[] = {
  116. { STRP("author"), RSSTagAuthor },
  117. { STRP("category"), RSSTagCategory },
  118. { STRP("content:encoded"), RSSTagContentEncoded },
  119. { STRP("dc:creator"), RSSTagDccreator },
  120. { STRP("dc:date"), RSSTagDcdate },
  121. { STRP("description"), RSSTagDescription },
  122. /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */
  123. { STRP("enclosure"), RSSTagEnclosure },
  124. { STRP("guid"), RSSTagGuid },
  125. { STRP("link"), RSSTagLink },
  126. { STRP("media:description"), RSSTagMediaDescription },
  127. { STRP("pubdate"), RSSTagPubdate },
  128. { STRP("title"), RSSTagTitle }
  129. };
  130. /* Atom, must be alphabetical order */
  131. static const FeedTag atomtags[] = {
  132. { STRP("author"), AtomTagAuthor },
  133. { STRP("category"), AtomTagCategory },
  134. { STRP("content"), AtomTagContent },
  135. { STRP("id"), AtomTagId },
  136. { STRP("issued"), AtomTagIssued }, /* Atom 0.3 */
  137. /* Atom: <link href="" />, RSS has <link></link> */
  138. { STRP("link"), AtomTagLink },
  139. { STRP("media:description"), AtomTagMediaDescription },
  140. { STRP("modified"), AtomTagModified }, /* Atom 0.3 */
  141. { STRP("published"), AtomTagPublished },
  142. { STRP("summary"), AtomTagSummary },
  143. { STRP("title"), AtomTagTitle },
  144. { STRP("updated"), AtomTagUpdated }
  145. };
  146. /* special case: nested <author><name> */
  147. static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
  148. static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };
  149. /* reference to no / unknown tag */
  150. static const FeedTag notag = { STRP(""), TagUnknown };
  151. /* map TagId type to RSS/Atom field, all tags must be defined */
  152. static const int fieldmap[TagLast] = {
  153. [TagUnknown] = -1,
  154. /* RSS */
  155. [RSSTagDcdate] = FeedFieldTime,
  156. [RSSTagPubdate] = FeedFieldTime,
  157. [RSSTagTitle] = FeedFieldTitle,
  158. [RSSTagMediaDescription] = FeedFieldContent,
  159. [RSSTagDescription] = FeedFieldContent,
  160. [RSSTagContentEncoded] = FeedFieldContent,
  161. [RSSTagGuid] = -1,
  162. [RSSTagGuidPermalinkFalse] = FeedFieldId,
  163. [RSSTagGuidPermalinkTrue] = FeedFieldId, /* special-case: both a link and an id */
  164. [RSSTagLink] = FeedFieldLink,
  165. [RSSTagEnclosure] = FeedFieldEnclosure,
  166. [RSSTagAuthor] = FeedFieldAuthor,
  167. [RSSTagDccreator] = FeedFieldAuthor,
  168. [RSSTagCategory] = FeedFieldCategory,
  169. /* Atom */
  170. [AtomTagModified] = FeedFieldTime,
  171. [AtomTagUpdated] = FeedFieldTime,
  172. [AtomTagIssued] = FeedFieldTime,
  173. [AtomTagPublished] = FeedFieldTime,
  174. [AtomTagTitle] = FeedFieldTitle,
  175. [AtomTagMediaDescription] = FeedFieldContent,
  176. [AtomTagSummary] = FeedFieldContent,
  177. [AtomTagContent] = FeedFieldContent,
  178. [AtomTagId] = FeedFieldId,
  179. [AtomTagLink] = -1,
  180. [AtomTagLinkAlternate] = FeedFieldLink,
  181. [AtomTagLinkEnclosure] = FeedFieldEnclosure,
  182. [AtomTagAuthor] = -1,
  183. [AtomTagAuthorName] = FeedFieldAuthor,
  184. [AtomTagCategory] = FeedFieldCategory
  185. };
  186. static const int FieldSeparator = '\t';
  187. /* separator for multiple values in a field, separator should be 1 byte */
  188. static const char FieldMultiSeparator[] = "|";
  189. static struct uri baseuri;
  190. static const char *baseurl;
  191. static FeedContext ctx;
  192. static XMLParser parser; /* XML parser state */
  193. static String attrispermalink, attrrel, attrtype, tmpstr;
  194. static int
  195. tagcmp(const void *v1, const void *v2)
  196. {
  197. return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name);
  198. }
  199. /* Unique tagid for parsed tag name. */
  200. static FeedTag *
  201. gettag(enum FeedType feedtype, const char *name, size_t namelen)
  202. {
  203. FeedTag f, *r = NULL;
  204. f.name = (char *)name;
  205. switch (feedtype) {
  206. case FeedTypeRSS:
  207. r = bsearch(&f, rsstags, sizeof(rsstags) / sizeof(rsstags[0]),
  208. sizeof(rsstags[0]), tagcmp);
  209. break;
  210. case FeedTypeAtom:
  211. r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]),
  212. sizeof(atomtags[0]), tagcmp);
  213. break;
  214. default:
  215. break;
  216. }
  217. return r;
  218. }
  /* Return a pointer to the first non-whitespace byte of `s` (this may be its
   * terminating NUL byte). */
  static char *
  ltrim(const char *s)
  {
      const char *p = s;

      while (ISSPACE((unsigned char)*p))
          p++;

      return (char *)p;
  }
  /* Return a pointer just past the last non-whitespace byte of `s`, so the
   * trailing whitespace (if any) starts at the returned position. */
  static char *
  rtrim(const char *s)
  {
      const char *end = s + strlen(s);

      while (end > s && ISSPACE((unsigned char)end[-1]))
          end--;

      return (char *)end;
  }
  234. /* Clear string only; don't free, prevents unnecessary reallocation. */
  235. static void
  236. string_clear(String *s)
  237. {
  238. if (s->data)
  239. s->data[0] = '\0';
  240. s->len = 0;
  241. }
  242. static void
  243. string_buffer_realloc(String *s, size_t newlen)
  244. {
  245. size_t alloclen;
  246. if (newlen > SIZE_MAX / 2) {
  247. alloclen = SIZE_MAX;
  248. } else {
  249. for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
  250. ;
  251. }
  252. if (!(s->data = realloc(s->data, alloclen)))
  253. err(1, "realloc");
  254. s->bufsiz = alloclen;
  255. }
  256. /* Append data to String, s->data and data may not overlap. */
  257. static void
  258. string_append(String *s, const char *data, size_t len)
  259. {
  260. if (!len)
  261. return;
  262. if (s->len >= SIZE_MAX - len) {
  263. errno = EOVERFLOW;
  264. err(1, "realloc");
  265. }
  266. /* check if allocation is necessary, never shrink the buffer. */
  267. if (s->len + len >= s->bufsiz)
  268. string_buffer_realloc(s, s->len + len + 1);
  269. memcpy(s->data + s->len, data, len);
  270. s->len += len;
  271. s->data[s->len] = '\0';
  272. }
  273. /* Print text, encode TABs, newlines and '\', remove other whitespace.
  274. * Remove leading and trailing whitespace. */
  275. static void
  276. string_print_encoded(String *s)
  277. {
  278. const char *p, *e;
  279. if (!s->data || !s->len)
  280. return;
  281. p = ltrim(s->data);
  282. e = rtrim(p);
  283. for (; *p && p != e; p++) {
  284. switch (*p) {
  285. case '\n': putchar('\\'); putchar('n'); break;
  286. case '\\': putchar('\\'); putchar('\\'); break;
  287. case '\t': putchar('\\'); putchar('t'); break;
  288. default:
  289. /* ignore control chars */
  290. if (!ISCNTRL((unsigned char)*p))
  291. putchar(*p);
  292. break;
  293. }
  294. }
  295. }
  /* Print `s` without leading and trailing whitespace. Any whitespace inside
   * is printed as a single ' ' per byte, other control chars are skipped. */
  static void
  printtrimmed(const char *s)
  {
      const char *cur = ltrim(s);
      const char *end = rtrim(cur);

      while (*cur && cur != end) {
          if (ISSPACE((unsigned char)*cur))
              putchar(' ');
          else if (!ISCNTRL((unsigned char)*cur))
              putchar(*cur);
          cur++;
      }
  }
  310. /* Print text, replace TABs, carriage return and other whitespace with ' '.
  311. * Other control chars are removed. Remove leading and trailing whitespace. */
  312. static void
  313. string_print_trimmed(String *s)
  314. {
  315. if (!s->data || !s->len)
  316. return;
  317. printtrimmed(s->data);
  318. }
  319. /* Print each field with trimmed whitespace, separated by '|'. */
  320. static void
  321. string_print_trimmed_multi(String *s)
  322. {
  323. char *p, *e;
  324. int c;
  325. if (!s->data || !s->len)
  326. return;
  327. for (p = s->data; ; p = e + 1) {
  328. if ((e = strstr(p, FieldMultiSeparator))) {
  329. c = *e;
  330. *e = '\0';
  331. printtrimmed(p);
  332. *e = c; /* restore NUL byte to original character */
  333. fputs(FieldMultiSeparator, stdout);
  334. } else {
  335. printtrimmed(p);
  336. break;
  337. }
  338. }
  339. }
  340. /* Print URL, if it's a relative URL then it uses the global `baseurl`. */
  341. static void
  342. printuri(char *s)
  343. {
  344. char link[4096], *p, *e;
  345. struct uri newuri, olduri;
  346. int c, r = -1;
  347. p = ltrim(s);
  348. e = rtrim(p);
  349. c = *e;
  350. *e = '\0';
  351. if (baseurl && !uri_hasscheme(p) &&
  352. uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
  353. uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
  354. r = uri_format(link, sizeof(link), &newuri);
  355. if (r >= 0 && (size_t)r < sizeof(link))
  356. printtrimmed(link);
  357. else
  358. printtrimmed(p);
  359. *e = c; /* restore NUL byte to original character */
  360. }
  361. /* Print URL, if it's a relative URL then it uses the global `baseurl`. */
  362. static void
  363. string_print_uri(String *s)
  364. {
  365. if (!s->data || !s->len)
  366. return;
  367. printuri(s->data);
  368. }
  369. /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
  370. static void
  371. string_print_timestamp(String *s)
  372. {
  373. long long t;
  374. if (!s->data || !s->len)
  375. return;
  376. if (parsetime(s->data, &t) != -1)
  377. printf("%lld", t);
  378. }
  /* Convert time fields to a UNIX timestamp.
   * `year` is the number of years since 1900 and `mon` is zero-based (0-11),
   * which is how parsetime() passes them; `day` starts at 1.
   * `mon` is used as a table index without a range check. */
  static long long
  datetounix(long long year, int mon, int day, int hour, int min, int sec)
  {
      /* seconds before the start of each month in a non-leap year */
      static const int month_start[] = {
            0 * 86400,  31 * 86400,  59 * 86400,  90 * 86400,
          120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
          243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
      int leap_year = 0, leap_days = 0;
      int cycles, centuries = 0, rem;
      long long secs;

      if (year - 2ULL <= 136) {
          /* years 1902-2038: every 4th year is a leap year */
          leap_days = (year - 68) >> 2;
          leap_year = !((year - 68) & 3);
          if (leap_year)
              leap_days--; /* the leap day of this year is added per month below */
          secs = 31536000 * (year - 70) + 86400 * leap_days;
      } else {
          /* general case: count in 400-year cycles relative to 2000 */
          cycles = (year - 100) / 400;
          rem = (year - 100) % 400;
          if (rem < 0) {
              cycles--;
              rem += 400;
          }
          if (rem == 0) {
              leap_year = 1;
          } else {
              centuries = rem / 100; /* 0 to 3 */
              rem -= centuries * 100;
              if (rem != 0) {
                  leap_days = rem / 4;
                  leap_year = (rem % 4) == 0;
              }
          }
          leap_days += 97 * cycles + 24 * centuries - leap_year;
          secs = (year - 100) * 31536000LL + leap_days * 86400LL +
                 946684800 + 86400;
      }

      secs += month_start[mon];
      if (leap_year && mon >= 2)
          secs += 86400; /* past February in a leap year */
      secs += 86400LL * (day - 1) + 3600LL * hour + 60LL * min + sec;

      return secs;
  }
  /* Get the timezone from a string, return the time offset in seconds from UTC.
   * Accepted are numeric offsets ("+HHMM", "-HH:MM") and the three-letter
   * names listed below; anything else gives 0.
   * NOTE: only parses timezones in RFC-822, many other timezone names are
   * ambiguous anyway.
   * ANSI and military zones are defined wrong in RFC822 and are unsupported,
   * see note on RFC2822 4.3 page 32. */
  static long
  gettzoffset(const char *s)
  {
      static const struct {
          char *name;
          int offhour; /* offset from UTC, stored in seconds */
      } tzones[] = {
          { "CDT", -5 * 3600 },
          { "CST", -6 * 3600 },
          { "EDT", -4 * 3600 },
          { "EST", -5 * 3600 },
          { "MDT", -6 * 3600 },
          { "MST", -7 * 3600 },
          { "PDT", -7 * 3600 },
          { "PST", -8 * 3600 },
      };
      const char *p;
      long hours = 0, mins = 0, sign;
      size_t i, n;

      while (ISSPACE((unsigned char)*s))
          s++;

      if (*s == '-' || *s == '+') {
          /* numeric offset: up to 2 hour digits, optional ':', up to 2
           * minute digits */
          sign = (*s == '-') ? -1 : 1;
          p = s + 1;
          for (n = 0; n < 2 && ISDIGIT((unsigned char)*p); n++, p++)
              hours = (hours * 10) + (*p - '0');
          if (*p == ':')
              p++;
          for (n = 0; n < 2 && ISDIGIT((unsigned char)*p); n++, p++)
              mins = (mins * 10) + (*p - '0');
          return ((hours * 3600) + (mins * 60)) * sign;
      }

      /* timezone name: must be exactly 3 letters */
      for (n = 0; ISALPHA((unsigned char)s[n]); n++)
          ;
      if (n != 3)
          return 0;

      /* compare timezone and adjust offset relative to UTC */
      for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
          if (memcmp(s, tzones[i].name, 3) == 0)
              return tzones[i].offhour;
      }

      return 0;
  }
  481. /* Parse time string `s` into the UNIX timestamp `tp`.
  482. Returns 0 on success or -1 on failure. */
  483. static int
  484. parsetime(const char *s, long long *tp)
  485. {
  486. static const struct {
  487. char *name;
  488. int len;
  489. } mons[] = {
  490. { STRP("January"), },
  491. { STRP("February"), },
  492. { STRP("March"), },
  493. { STRP("April"), },
  494. { STRP("May"), },
  495. { STRP("June"), },
  496. { STRP("July"), },
  497. { STRP("August"), },
  498. { STRP("September"), },
  499. { STRP("October"), },
  500. { STRP("November"), },
  501. { STRP("December"), },
  502. };
  503. int va[6] = { 0 }, i, j, v, vi;
  504. size_t m;
  505. for (; ISSPACE((unsigned char)*s); s++)
  506. ;
  507. if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
  508. return -1;
  509. if (ISDIGIT((unsigned char)s[0]) &&
  510. ISDIGIT((unsigned char)s[1]) &&
  511. ISDIGIT((unsigned char)s[2]) &&
  512. ISDIGIT((unsigned char)s[3])) {
  513. /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
  514. vi = 0;
  515. } else {
  516. /* format: "[%a, ]%d %b %Y %H:%M:%S" */
  517. /* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
  518. for (; ISALPHA((unsigned char)*s); s++)
  519. ;
  520. for (; ISSPACE((unsigned char)*s); s++)
  521. ;
  522. if (*s == ',')
  523. s++;
  524. for (; ISSPACE((unsigned char)*s); s++)
  525. ;
  526. for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++)
  527. v = (v * 10) + (*s - '0');
  528. va[2] = v; /* day */
  529. for (; ISSPACE((unsigned char)*s); s++)
  530. ;
  531. /* end of word month */
  532. for (j = 0; ISALPHA((unsigned char)s[j]); j++)
  533. ;
  534. /* check month name */
  535. if (j < 3 || j > 9)
  536. return -1; /* month cannot match */
  537. for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
  538. /* abbreviation (3 length) or long name */
  539. if ((j == 3 || j == mons[m].len) &&
  540. !strncasecmp(mons[m].name, s, j)) {
  541. va[1] = m + 1;
  542. s += j;
  543. break;
  544. }
  545. }
  546. if (m >= 12)
  547. return -1; /* no month found */
  548. for (; ISSPACE((unsigned char)*s); s++)
  549. ;
  550. for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++)
  551. v = (v * 10) + (*s - '0');
  552. /* obsolete short year: RFC2822 4.3 */
  553. if (i <= 3)
  554. v += (v >= 0 && v <= 49) ? 2000 : 1900;
  555. va[0] = v; /* year */
  556. for (; ISSPACE((unsigned char)*s); s++)
  557. ;
  558. /* parse only regular time part, see below */
  559. vi = 3;
  560. }
  561. /* parse time parts (and possibly remaining date parts) */
  562. for (; *s && vi < 6; vi++) {
  563. for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
  564. ISDIGIT((unsigned char)*s); s++, i++) {
  565. v = (v * 10) + (*s - '0');
  566. }
  567. va[vi] = v;
  568. if ((vi < 2 && *s == '-') ||
  569. (vi == 2 && (*s == 'T' || ISSPACE((unsigned char)*s))) ||
  570. (vi > 2 && *s == ':'))
  571. s++;
  572. }
  573. /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
  574. if (*s == '.') {
  575. for (s++; ISDIGIT((unsigned char)*s); s++)
  576. ;
  577. }
  578. /* invalid range */
  579. if (va[0] < 0 || va[0] > 9999 ||
  580. va[1] < 1 || va[1] > 12 ||
  581. va[2] < 1 || va[2] > 31 ||
  582. va[3] < 0 || va[3] > 23 ||
  583. va[4] < 0 || va[4] > 59 ||
  584. va[5] < 0 || va[5] > 60) /* allow leap second */
  585. return -1;
  586. *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
  587. gettzoffset(s);
  588. return 0;
  589. }
  590. static void
  591. printfields(void)
  592. {
  593. string_print_timestamp(&ctx.fields[FeedFieldTime].str);
  594. putchar(FieldSeparator);
  595. string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
  596. putchar(FieldSeparator);
  597. string_print_uri(&ctx.fields[FeedFieldLink].str);
  598. putchar(FieldSeparator);
  599. string_print_encoded(&ctx.fields[FeedFieldContent].str);
  600. putchar(FieldSeparator);
  601. fputs(contenttypes[ctx.contenttype], stdout);
  602. putchar(FieldSeparator);
  603. string_print_trimmed(&ctx.fields[FeedFieldId].str);
  604. putchar(FieldSeparator);
  605. string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
  606. putchar(FieldSeparator);
  607. string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
  608. putchar(FieldSeparator);
  609. string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
  610. putchar('\n');
  611. if (ferror(stdout)) /* check for errors but do not flush */
  612. checkfileerror(stdout, "<stdout>", 'w');
  613. }
  /* Return 1 if both tag names have the same length and are equal ignoring
   * case, else 0. Both names must be NUL-terminated. */
  static int
  istag(const char *name, size_t len, const char *name2, size_t len2)
  {
      if (len != len2)
          return 0;

      return strcasecmp(name, name2) == 0;
  }
  /* Return 1 if both attribute names have the same length and are equal
   * ignoring case, else 0. Both names must be NUL-terminated. */
  static int
  isattr(const char *name, size_t len, const char *name2, size_t len2)
  {
      if (len != len2)
          return 0;

      return strcasecmp(name, name2) == 0;
  }
  624. static void
  625. xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
  626. const char *v, size_t vl)
  627. {
  628. /* handles transforming inline XML to data */
  629. if (ISINCONTENT(ctx)) {
  630. if (ctx.contenttype == ContentTypeHTML)
  631. xmldata(p, v, vl);
  632. return;
  633. }
  634. if (!ctx.tag.id)
  635. return;
  636. /* content-type may be: Atom: text, xhtml, html or mime-type.
  637. MRSS (media:description): plain, html. */
  638. if (ISCONTENTTAG(ctx)) {
  639. if (isattr(n, nl, STRP("type")))
  640. string_append(&attrtype, v, vl);
  641. return;
  642. }
  643. if (ctx.feedtype == FeedTypeRSS) {
  644. if (ctx.tag.id == RSSTagEnclosure &&
  645. isattr(n, nl, STRP("url"))) {
  646. string_append(&tmpstr, v, vl);
  647. } else if (ctx.tag.id == RSSTagGuid &&
  648. isattr(n, nl, STRP("ispermalink"))) {
  649. string_append(&attrispermalink, v, vl);
  650. }
  651. } else if (ctx.feedtype == FeedTypeAtom) {
  652. if (ctx.tag.id == AtomTagLink) {
  653. if (isattr(n, nl, STRP("rel"))) {
  654. string_append(&attrrel, v, vl);
  655. } else if (isattr(n, nl, STRP("href"))) {
  656. string_append(&tmpstr, v, vl);
  657. }
  658. } else if (ctx.tag.id == AtomTagCategory &&
  659. isattr(n, nl, STRP("term"))) {
  660. string_append(&tmpstr, v, vl);
  661. }
  662. }
  663. }
  664. static void
  665. xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
  666. const char *data, size_t datalen)
  667. {
  668. char buf[16];
  669. int len;
  670. /* handles transforming inline XML to data */
  671. if (ISINCONTENT(ctx)) {
  672. if (ctx.contenttype == ContentTypeHTML)
  673. xmldata(p, data, datalen);
  674. return;
  675. }
  676. if (!ctx.tag.id)
  677. return;
  678. /* try to translate entity, else just pass as data to
  679. * xmlattr handler. */
  680. if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
  681. xmlattr(p, t, tl, n, nl, buf, (size_t)len);
  682. else
  683. xmlattr(p, t, tl, n, nl, data, datalen);
  684. }
  685. static void
  686. xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
  687. {
  688. if (ISINCONTENT(ctx)) {
  689. if (ctx.contenttype == ContentTypeHTML) {
  690. /* handles transforming inline XML to data */
  691. xmldata(p, "\"", 1);
  692. ctx.attrcount = 0;
  693. }
  694. return;
  695. }
  696. }
  697. static void
  698. xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
  699. {
  700. if (ISINCONTENT(ctx)) {
  701. if (ctx.contenttype == ContentTypeHTML) {
  702. /* handles transforming inline XML to data */
  703. if (!ctx.attrcount)
  704. xmldata(p, " ", 1);
  705. ctx.attrcount++;
  706. xmldata(p, n, nl);
  707. xmldata(p, "=\"", 2);
  708. }
  709. return;
  710. }
  711. if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
  712. string_clear(&attrispermalink);
  713. else if (attrrel.len && isattr(n, nl, STRP("rel")))
  714. string_clear(&attrrel);
  715. else if (attrtype.len && isattr(n, nl, STRP("type")))
  716. string_clear(&attrtype);
  717. else if (tmpstr.len &&
  718. (isattr(n, nl, STRP("href")) ||
  719. isattr(n, nl, STRP("term")) ||
  720. isattr(n, nl, STRP("url"))))
  721. string_clear(&tmpstr); /* use the last value for multiple attribute values */
  722. }
  723. static void
  724. xmldata(XMLParser *p, const char *s, size_t len)
  725. {
  726. if (!ctx.field)
  727. return;
  728. if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
  729. string_append(&tmpstr, s, len);
  730. else
  731. string_append(ctx.field, s, len);
  732. }
  733. static void
  734. xmldataentity(XMLParser *p, const char *data, size_t datalen)
  735. {
  736. char buf[16];
  737. int len;
  738. if (!ctx.field)
  739. return;
  740. /* try to translate entity, else just pass as data to
  741. * xmldata handler. */
  742. if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
  743. xmldata(p, buf, (size_t)len);
  744. else
  745. xmldata(p, data, datalen);
  746. }
  747. static void
  748. xmltagstart(XMLParser *p, const char *t, size_t tl)
  749. {
  750. const FeedTag *f;
  751. if (ISINCONTENT(ctx)) {
  752. if (ctx.contenttype == ContentTypeHTML) {
  753. ctx.attrcount = 0;
  754. xmldata(p, "<", 1);
  755. xmldata(p, t, tl);
  756. }
  757. return;
  758. }
  759. /* start of RSS or Atom item / entry */
  760. if (ctx.feedtype == FeedTypeNone) {
  761. if (istag(t, tl, STRP("entry")))
  762. ctx.feedtype = FeedTypeAtom;
  763. else if (istag(t, tl, STRP("item")))
  764. ctx.feedtype = FeedTypeRSS;
  765. return;
  766. }
  767. /* field tagid already set or nested tags. */
  768. if (ctx.tag.id) {
  769. /* nested <author><name> for Atom */
  770. if (ctx.tag.id == AtomTagAuthor &&
  771. istag(t, tl, STRP("name"))) {
  772. memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag));
  773. } else {
  774. return; /* other nested tags are not allowed: return */
  775. }
  776. }
  777. /* in item */
  778. if (ctx.tag.id == TagUnknown) {
  779. if (!(f = gettag(ctx.feedtype, t, tl)))
  780. f = &notag;
  781. memcpy(&(ctx.tag), f, sizeof(ctx.tag));
  782. }
  783. ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
  784. string_clear(&attrispermalink);
  785. string_clear(&attrrel);
  786. string_clear(&attrtype);
  787. }
  788. static void
  789. xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
  790. {
  791. enum TagId tagid;
  792. if (ISINCONTENT(ctx)) {
  793. if (ctx.contenttype == ContentTypeHTML) {
  794. if (isshort)
  795. xmldata(p, "/>", 2);
  796. else
  797. xmldata(p, ">", 1);
  798. }
  799. return;
  800. }
  801. /* set tag type based on it's attribute value */
  802. if (ctx.tag.id == RSSTagGuid) {
  803. /* if empty the default is "true" */
  804. if (!attrispermalink.len ||
  805. isattr(attrispermalink.data, attrispermalink.len, STRP("true")))
  806. ctx.tag.id = RSSTagGuidPermalinkTrue;
  807. else
  808. ctx.tag.id = RSSTagGuidPermalinkFalse;
  809. } else if (ctx.tag.id == AtomTagLink) {
  810. /* empty or "alternate": other types could be
  811. "enclosure", "related", "self" or "via" */
  812. if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
  813. ctx.tag.id = AtomTagLinkAlternate;
  814. else if (isattr(attrrel.data, attrrel.len, STRP("enclosure")))
  815. ctx.tag.id = AtomTagLinkEnclosure;
  816. else
  817. ctx.tag.id = AtomTagLink; /* unknown */
  818. }
  819. tagid = ctx.tag.id;
  820. /* map tag type to field: unknown or lesser priority is ignored,
  821. when tags of the same type are repeated only the first is used. */
  822. if (fieldmap[tagid] == -1 ||
  823. (!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
  824. tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
  825. return;
  826. }
  827. if (ctx.iscontenttag) {
  828. ctx.iscontent = 1;
  829. ctx.iscontenttag = 0;
  830. /* detect content-type based on type attribute */
  831. if (attrtype.len) {
  832. if (isattr(attrtype.data, attrtype.len, STRP("html")) ||
  833. isattr(attrtype.data, attrtype.len, STRP("xhtml")) ||
  834. isattr(attrtype.data, attrtype.len, STRP("text/html")) ||
  835. isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) ||
  836. isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml")))
  837. ctx.contenttype = ContentTypeHTML;
  838. else /* unknown: handle as base64 text data */
  839. ctx.contenttype = ContentTypePlain;
  840. } else {
  841. /* default content-type */
  842. if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription)
  843. ctx.contenttype = ContentTypeHTML;
  844. else
  845. ctx.contenttype = ContentTypePlain;
  846. }
  847. }
  848. ctx.field = &(ctx.fields[fieldmap[tagid]].str);
  849. ctx.fields[fieldmap[tagid]].tagid = tagid;
  850. /* clear field if it is overwritten (with a priority order) for the new
  851. value, if the field can have multiple values then do not clear it. */
  852. if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
  853. string_clear(ctx.field);
  854. }
  855. static void
  856. xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
  857. {
  858. size_t i;
  859. if (ctx.feedtype == FeedTypeNone)
  860. return;
  861. if (ISINCONTENT(ctx)) {
  862. /* not a closed content field */
  863. if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
  864. if (!isshort && ctx.contenttype == ContentTypeHTML) {
  865. xmldata(p, "</", 2);
  866. xmldata(p, t, tl);
  867. xmldata(p, ">", 1);
  868. }
  869. return;
  870. }
  871. } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
  872. /* matched tag end: close it */
  873. /* copy also to the link field if the attribute isPermaLink="true"
  874. and it is not set by a tag with higher priority. */
  875. if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
  876. ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
  877. string_clear(&ctx.fields[FeedFieldLink].str);
  878. string_append(&ctx.fields[FeedFieldLink].str,
  879. ctx.field->data, ctx.field->len);
  880. ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
  881. }
  882. } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
  883. istag(t, tl, STRP("entry"))) || /* Atom */
  884. (ctx.feedtype == FeedTypeRSS &&
  885. istag(t, tl, STRP("item"))))) /* RSS */
  886. {
  887. /* end of RSS or Atom entry / item */
  888. printfields();
  889. /* clear strings */
  890. for (i = 0; i < FeedFieldLast; i++) {
  891. string_clear(&ctx.fields[i].str);
  892. ctx.fields[i].tagid = TagUnknown;
  893. }
  894. ctx.contenttype = ContentTypeNone;
  895. /* allow parsing of Atom and RSS concatenated in one XML stream. */
  896. ctx.feedtype = FeedTypeNone;
  897. } else {
  898. return; /* not end of field */
  899. }
  900. /* temporary string: for fields that cannot be processed
  901. directly and need more context, for example by it's tag
  902. attributes, like the Atom link rel="alternate|enclosure". */
  903. if (tmpstr.len && ctx.field) {
  904. if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
  905. if (ctx.field->len)
  906. string_append(ctx.field, FieldMultiSeparator, 1);
  907. string_append(ctx.field, tmpstr.data, tmpstr.len);
  908. } else {
  909. string_clear(ctx.field);
  910. string_append(ctx.field, tmpstr.data, tmpstr.len);
  911. }
  912. }
  913. /* close field */
  914. string_clear(&tmpstr); /* reuse and clear temporary string */
  915. if (ctx.tag.id == AtomTagAuthorName)
  916. memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
  917. else
  918. memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
  919. ctx.iscontent = 0;
  920. ctx.field = NULL;
  921. }
  922. int
  923. main(int argc, char *argv[])
  924. {
  925. if (pledge("stdio", NULL) == -1)
  926. err(1, "pledge");
  927. if (argc > 1) {
  928. if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
  929. baseurl = argv[1];
  930. else
  931. errx(1, "baseurl incorrect or too long");
  932. }
  933. memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
  934. parser.xmlattr = xmlattr;
  935. parser.xmlattrentity = xmlattrentity;
  936. parser.xmlattrend = xmlattrend;
  937. parser.xmlattrstart = xmlattrstart;
  938. parser.xmlcdata = xmldata;
  939. parser.xmldata = xmldata;
  940. parser.xmldataentity = xmldataentity;
  941. parser.xmltagend = xmltagend;
  942. parser.xmltagstart = xmltagstart;
  943. parser.xmltagstartparsed = xmltagstartparsed;
  944. /* NOTE: getnext is defined in xml.h for inline optimization */
  945. xml_parse(&parser);
  946. checkfileerror(stdin, "<stdin>", 'r');
  947. checkfileerror(stdout, "<stdout>", 'w');
  948. return 0;
  949. }