wiki_render.c 142 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
7477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843
  1. #include <stdio.h>
  2. #include <inttypes.h>
  3. #include <ctype.h>
  4. #include <errno.h>
  5. #include <stddef.h>
  6. #include <stdlib.h>
  7. #include <sys/stat.h>
  8. #include <sys/types.h>
  9. #include <wchar.h>
  10. #include <malloc.h>
  11. #include <stdarg.h>
  12. #include <string.h>
  13. #include <mysql/mysql.h>
  14. #include <time.h>
  15. #include <dirent.h>
  16. #include "bmf.h"
  17. #include "wiki_render.h"
  18. #include "lcd_buf_draw.h"
  19. #include "Alloc.h"
  20. #include "Bra.h"
  21. #include "LzmaEnc.h"
  22. #include "LzmaDec.h"
  23. #include "bigram.h"
  24. #include "search_hash.h"
  /* Size limit for local text buffers (not referenced in this part of the file). */
  #define MAX_LOCAL_TEXT_BUF 512000

  /* Kinds of list; presumably the values stored in render_buf.list_type[]. */
  #define LIST_TYPE_ORDERED 1
  #define LIST_TYPE_UNORDERED 2
  #define LIST_TYPE_DEFINITION 3

  /* Indentation steps in pixels and the deepest supported list nesting. */
  #define PIXELS_PER_INDENT 20
  #define PIXELS_FIRST_INDENT 10
  #define MAX_LIST_DEPTH 10

  /* Kinds of list item. */
  #define LIST_LI 1
  #define LIST_DT 2
  #define LIST_DD 3

  /*
   * Global cursor/state of the renderer.  The rendered text itself lives in
   * render_buf_sLines[]; this structure only tracks where the next output goes.
   */
  struct render_buf {
      int nCurrentLineIdx;              /* next write offset inside the current output line */
      int nCurrentFontIdx;              /* font currently selected (NOTE(review): inferred from the name) */
      int nCurrentRenderType;           /* presumably one of RENDER_TYPE_* */
      int nCurrentLineHeight;           /* height of the current line (NOTE(review): inferred from the name) */
      long nLines;                      /* number of output lines started so far */
      long current_x;                   /* horizontal pixel position; render_title() sets -1 after a line break */
      long current_y;                   /* vertical pixel position, advanced by each emitted line */
      int current_indent;               /* x positions at or below this count as "start of line" */
      int list_depth;                   /* presumably the current list nesting level */
      char list_type[MAX_LIST_DEPTH];   /* per nesting level; presumably LIST_TYPE_* */
      int ol_count[MAX_LIST_DEPTH];     /* per nesting level; presumably the ordered-list counter */
      int nLinks;                       /* presumably the number of used entries in links[] */
  } render_buf;
  /*
   * Rendered output, one row per line: 64K rows of 256 bytes each.
   * Row (render_buf.nLines - 1) is the line currently being written.
   */
  char render_buf_sLines[64*1024][256];
  /* Capacity limits for the link table and for class/cell strings. */
  #define MAX_LINKS 10240
  #define MAX_LINK_STRING 256
  #define MAX_CLASS_STRING 2048
  #define MAX_CELL_STRING 4096

  /*
   * One link in the rendered article: the start and end coordinates it covers
   * and its target string.
   */
  struct link {
      long start_x;
      long start_y;
      long end_x;
      long end_y;
      int bWikiLink;                  /* presumably non-zero for an internal wiki link */
      char sLink[MAX_LINK_STRING];    /* link target text */
  } links[MAX_LINKS];
  /* Characters before which a line may be broken. */
  char word_break_before_chars[] = "\n\r ";
  /* Characters after which a line may be broken. */
  char word_break_after_chars[] = "\n\r )*+,-./:;<=>?]|}";

  /*
   * Extra "flexible break" masks.  srting_len_fit_width() ANDs each of them
   * with the current content byte (a plain char) and treats a non-zero result
   * as a break opportunity.
   * NOTE(review): all mask bits lie above bit 11, so the test can only be
   * non-zero when a negative char is sign-extended; the outcome therefore
   * depends on whether plain char is signed on the target - confirm intent.
   */
  #define WORD_BREAK_FLEX_CHAR_MASK1 0xF3000
  #define WORD_BREAK_FLEX_CHAR_MASK2 0xF4000
  #define WORD_BREAK_FLEX_CHAR_MASK3 0xF8000
  /* Render modes; presumably the values held in render_buf.nCurrentRenderType. */
  #define RENDER_TYPE_NORMAL 0
  #define RENDER_TYPE_LINK 1
  #define RENDER_TYPE_TABLE_DATA 2
  /* Capacity of the start/end tag strings in the tag tables. */
  #define MAX_TAG_LEN 256

  /*
   * Identifiers of the wiki markup constructs.  Each value below
   * WIKI_TAG_PAIR_TEXT is also the index of its row in wiki_tag_pairs[];
   * WIKI_TAG_PAIR_TEXT itself has no row.
   */
  #define WIKI_TAG_PAIR_TEMPLATE 0
  #define WIKI_TAG_PAIR_LINK 1
  #define WIKI_TAG_PAIR_TABLE 2
  #define WIKI_TAG_PAIR_TABLE_CAPTION 3
  #define WIKI_TAG_PAIR_TABLE_ROW 4
  #define WIKI_TAG_PAIR_TABLE_CELLS 5
  #define WIKI_TAG_PAIR_TABLE_HEADER 6
  #define WIKI_TAG_PAIR_EXTERNAL_LINK 7
  #define WIKI_TAG_PAIR_H6 8
  #define WIKI_TAG_PAIR_H5 9
  #define WIKI_TAG_PAIR_H4 10
  #define WIKI_TAG_PAIR_H3 11
  #define WIKI_TAG_PAIR_H2 12
  #define WIKI_TAG_PAIR_BOLD_ITALIC 13
  #define WIKI_TAG_PAIR_BOLD 14
  #define WIKI_TAG_PAIR_ITALIC 15
  #define WIKI_TAG_PAIR_NOWIKI 16
  #define WIKI_TAG_PAIR_PRE 17
  #define WIKI_TAG_PAIR_PRE_LINE 18
  #define WIKI_TAG_PAIR_SEP 19
  #define WIKI_TAG_PAIR_OL 20
  #define WIKI_TAG_PAIR_UL 21
  #define WIKI_TAG_PAIR_DT 22
  #define WIKI_TAG_PAIR_DD 23
  #define WIKI_TAG_PAIR_REF 24
  #define WIKI_TAG_PAIR_COMMENT 25
  #define WIKI_TAG_PAIR_TEXT 26
  #define MAX_WIKI_TAG_PAIRS WIKI_TAG_PAIR_TEXT

  /*
   * Start/end marker of one wiki markup construct.  The markers are written
   * in XML-escaped form ("&lt;" / "&gt;"), and lenTagStart / lenTagEnd are
   * the string lengths of the two markers.  Rows use designated initializers
   * so that a row can never get out of step with its WIKI_TAG_PAIR_* id.
   */
  struct wiki_tag_pair {
      char sTagStart[MAX_TAG_LEN];
      char sTagEnd[MAX_TAG_LEN];
      int lenTagStart;
      int lenTagEnd;
      int bGotChild;      /* presumably: the construct may contain nested constructs */
      int bBeginOfLine;   /* presumably: the start marker only counts at the beginning of a line */
  } wiki_tag_pairs[MAX_WIKI_TAG_PAIRS] = {
      [WIKI_TAG_PAIR_TEMPLATE]      = {"{{", "}}", 2, 2, 0, 0}, // template
      /* NOTE(review): the end marker is a single "]" although the start is "[[" - confirm this is intended */
      [WIKI_TAG_PAIR_LINK]          = {"[[", "]", 2, 1, 1, 0}, // link
      [WIKI_TAG_PAIR_TABLE]         = {"{|", "|}", 2, 2, 1, 1}, // table
      [WIKI_TAG_PAIR_TABLE_CAPTION] = {"|+", "\n", 2, 1, 1, 1}, // table caption
      [WIKI_TAG_PAIR_TABLE_ROW]     = {"|-", "\n", 2, 1, 1, 1}, // row
      [WIKI_TAG_PAIR_TABLE_CELLS]   = {"|", "\n", 1, 1, 1, 1}, // row cells
      [WIKI_TAG_PAIR_TABLE_HEADER]  = {"!", "\n", 1, 1, 1, 1}, // column or row header
      [WIKI_TAG_PAIR_EXTERNAL_LINK] = {"[", "]", 1, 1, 0, 0}, // external link
      [WIKI_TAG_PAIR_H6]            = {"======", "======", 6, 6, 0, 1}, // H6
      [WIKI_TAG_PAIR_H5]            = {"=====", "=====", 5, 5, 0, 1}, // H5
      [WIKI_TAG_PAIR_H4]            = {"====", "====", 4, 4, 0, 1}, // H4
      [WIKI_TAG_PAIR_H3]            = {"===", "===", 3, 3, 0, 1}, // H3
      [WIKI_TAG_PAIR_H2]            = {"==", "==", 2, 2, 0, 1}, // H2
      [WIKI_TAG_PAIR_BOLD_ITALIC]   = {"\'\'\'\'\'", "\'\'\'\'\'", 5, 5, 1, 0}, // bold italic
      [WIKI_TAG_PAIR_BOLD]          = {"\'\'\'", "\'\'\'", 3, 3, 1, 0}, // bold
      [WIKI_TAG_PAIR_ITALIC]        = {"\'\'", "\'\'", 2, 2, 1, 0}, // italic
      [WIKI_TAG_PAIR_NOWIKI]        = {"&lt;nowiki&gt;", "&lt;/nowiki&gt;", 14, 15, 0, 0}, // nowiki
      [WIKI_TAG_PAIR_PRE]           = {"&lt;pre&gt;", "&lt;/pre&gt;", 11, 12, 0, 0}, // preserve format
      [WIKI_TAG_PAIR_PRE_LINE]      = {" ", "\n", 1, 1, 1, 1}, // line with preserved format
      [WIKI_TAG_PAIR_SEP]           = {"#####", "\n", 5, 1, 1, 1}, // separator line
      [WIKI_TAG_PAIR_OL]            = {"#", "\n", 1, 1, 1, 1}, // ordered list
      [WIKI_TAG_PAIR_UL]            = {"*", "\n", 1, 1, 1, 1}, // unordered list
      [WIKI_TAG_PAIR_DT]            = {";", "\n", 1, 1, 1, 1}, // definition term
      [WIKI_TAG_PAIR_DD]            = {":", "\n", 1, 1, 1, 1}, // definition
      [WIKI_TAG_PAIR_REF]           = {"&lt;ref", "&lt;/ref&gt;", 7, 12, 0, 0}, // reference (to be filtered out)
      [WIKI_TAG_PAIR_COMMENT]       = {"&lt;!--", "--&gt;", 7, 6, 0, 0} // comment
  };
  /* Limits for the wiki node array and the tag stack. */
  #define MAX_WIKI_NODES 160000
  #define MAX_WIKI_TAG_STACK 128

  /* One node for either a start tag, an end tag or content (the string between tags). */
  struct wiki_node {
      int idxTag;      /* presumably a WIKI_TAG_PAIR_* value */
      int bTagStart;   /* presumably non-zero for a start tag */
      char *pTag;      /* only meaningful for content (string between tags) */
      int lenTag;      /* presumably the length of the text at pTag */
  } *wiki_nodes;

  /* Number of wiki nodes currently in use. */
  int nWikiNodeCount = 0;
  /*
   * Identifiers of the HTML-style tags.  Each value below TAG_PAIR_UNKNOWN is
   * also the index of its row in tag_pairs[]; TAG_PAIR_UNKNOWN has no row.
   */
  #define TAG_PAIR_STRONG 0
  #define TAG_PAIR_SCRIPT 1
  #define TAG_PAIR_DIV 2
  #define TAG_PAIR_SPAN 3
  #define TAG_PAIR_REFERENCE 4
  #define TAG_PAIR_BR 5
  #define TAG_PAIR_BR2 6
  #define TAG_PAIR_H1 7
  #define TAG_PAIR_H2 8
  #define TAG_PAIR_H3 9
  #define TAG_PAIR_H4 10
  #define TAG_PAIR_H5 11
  #define TAG_PAIR_H6 12
  #define TAG_PAIR_A 13
  #define TAG_PAIR_BIG_BIG_BIG 14
  #define TAG_PAIR_BIG_BIG 15
  #define TAG_PAIR_BIG 16
  #define TAG_PAIR_BI 17
  #define TAG_PAIR_B 18
  #define TAG_PAIR_I 19
  #define TAG_PAIR_DEL 20
  #define TAG_PAIR_INS 21
  #define TAG_PAIR_LI 22
  #define TAG_PAIR_OL 23
  #define TAG_PAIR_EOL 24
  #define TAG_PAIR_DL 25
  #define TAG_PAIR_EDL 26
  #define TAG_PAIR_DT 27
  #define TAG_PAIR_DD 28
  #define TAG_PAIR_P_END 29
  #define TAG_PAIR_P 30
  #define TAG_PAIR_REF 31
  #define TAG_PAIR_SUP 32
  #define TAG_PAIR_TABLE 33
  #define TAG_PAIR_TD 34
  #define TAG_PAIR_TH 35
  #define TAG_PAIR_TR 36
  #define TAG_PAIR_UL 37
  #define TAG_PAIR_EUL 38
  #define TAG_PAIR_HTTP 39
  #define TAG_PAIR_UNKNOWN 40
  #define MAX_TAG_PAIRS TAG_PAIR_UNKNOWN
  185. struct tag_pair {
  186. char sTagStart[MAX_TAG_LEN];
  187. char sTagEnd[MAX_TAG_LEN];
  188. int lenTagStart;
  189. int lenTagEnd;
  190. int bGotChild;
  191. } tag_pairs[MAX_TAG_PAIRS] = {
  192. {"<strong", "</strong>", 7, 9, 0},
  193. {"<script", "</script>", 7, 9, 0},
  194. {"<div", "</div>", 4, 6, 1},
  195. {"<span", "</span>", 5, 7, 1},
  196. {"<reference/", "", 11, 0, 1},
  197. {"<br", "", 3, 0, 0},
  198. {"<br/", "", 4, 0, 0},
  199. {"<h1", "</h1>", 3, 5, 1}, // Header 2
  200. {"<h2", "</h2>", 3, 5, 1}, // Header 2
  201. {"<h3", "</h3>", 3, 5, 1}, // Header 3
  202. {"<h4", "</h4>", 3, 5, 1}, // Header 4
  203. {"<h5", "</h5>", 3, 5, 1}, // Header 5
  204. {"<h6", "</h6>", 3, 5, 1}, // Header 6
  205. {"<a", "</a>", 2, 4, 1}, // <a> </a>
  206. {"<big><big><big", "</big></big></big>", 14, 18, 1},// big big big
  207. {"<big><big", "</big></big>", 9, 12, 1},// big big
  208. {"<big", "</big>", 4, 6, 1}, // big
  209. {"<b><i", "</i></b>", 5, 8, 1}, // bold
  210. {"<b", "</b>", 2, 4, 1}, // bold
  211. {"<i", "</i>", 2, 4, 1}, // italic
  212. {"<del", "</del>", 4, 6, 1}, // deleted text (striked out)
  213. {"<ins", "</ins>", 4, 6, 1}, // inserted text (underlined)
  214. {"<li", "</li>", 3, 5, 1}, // list item
  215. {"<ol", "", 3, 0, 0}, // order list start
  216. {"</ol", "", 4, 0, 0}, // order list end
  217. {"<dl", "", 3, 0, 0}, // definition list start
  218. {"</dl", "", 4, 0, 0}, // definition list end
  219. {"<dt", "</dt>", 3, 5, 1}, // definition term
  220. {"<dd", "</dd>", 3, 5, 1}, // definition
  221. {"<p/", "", 3, 0, 0},
  222. {"<p", "</p>", 2, 4, 1}, // <p>
  223. {"<ref", "</ref>", 4, 6, 1}, // <ref> </ref>
  224. {"<sup", "</sup>", 4, 6, 1}, // <sup>
  225. {"<table", "</table>", 6, 8, 1}, // table
  226. {"<td", "</td>", 3, 5, 1}, // table data
  227. {"<th", "</th>", 3, 5, 1}, // table header
  228. {"<tr", "</tr>", 3, 5, 1}, // table row
  229. {"<ul", "", 3, 0, 0}, // unorder list start
  230. {"</ul", "", 4, 0, 0}, // unorder list end
  231. {"<http:", "", 6, 0, 0}
  232. };
  /* Limit for the article node array. */
  #define MAX_ARTICLE_NODES 80000

  /*
   * One node of the parsed article.  Nodes reference each other by index
   * (idxChildNode / idxNextNode) rather than by pointer.
   */
  struct article_node {
      int idxTag;          /* presumably a TAG_PAIR_* value */
      char *pTagDesc;      /* tag description text; lenTagDesc is its length */
      int lenTagDesc;
      char *pContent;      /* node content; len is its length */
      int len;
      int idxChildNode;    /* index of the first child node */
      int idxNextNode;     /* index of the next sibling node */
  } *article_nodes;

  /* Number of article nodes currently in use. */
  int nArticleNodeCount = 0;
  244. //#define HEAP_ALLOC(var,size) \
  245. // lzo_align_t __LZO_MMODEL var [ ((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t) ]
  246. //static HEAP_ALLOC(wrkmem,LZO1X_999_MEM_COMPRESS);
  /* Allocation callback: forwards to malloc(); the context pointer p is unused. */
  static void *SzAlloc(void *p, size_t size)
  {
      (void)p;   /* unused; replaces the self-assignment that some compilers warn about */
      return malloc(size);
  }
  /* Release callback: forwards to free(); the context pointer p is unused. */
  static void SzFree(void *p, void *address)
  {
      (void)p;   /* unused; replaces the self-assignment that some compilers warn about */
      free(address);
  }
  249. static ISzAlloc g_Alloc = { SzAlloc, SzFree };
  /* Message verbosity level, defined in another translation unit. */
  extern int nMsgLevel;

  /* Forward declaration; the definition is not in this part of the file. */
  long process_wiki_tag(int *idxNode, int maxNode, char *sText, char *sBuf, long lenBuf, long maxLenBuf, int bInList);
  /*
   * Throughput reporter.
   *
   * nCount == 0 : remember the current time as the start of the measurement.
   * otherwise   : once more than one second has elapsed since that start,
   *               report the count, the elapsed seconds and the rate through
   *               showMsg().
   *
   * The rate is kept in a long so that it matches its "%ld" conversion (it
   * used to be a uint32_t, which is a mismatched argument type for "%ld").
   */
  void processing_speed(long nCount)
  {
      static time_t base_t;
      time_t current_t;
      double elapsed_seconds;
      long nCountPerSecond;

      if (nCount == 0)
      {
          time(&base_t);
          return;
      }

      time(&current_t);
      elapsed_seconds = difftime(current_t, base_t);
      if (elapsed_seconds > 1)
      {
          nCountPerSecond = (long)(nCount / elapsed_seconds);
          showMsg(0, "%ld titles processed in %ld seconds, %ld titles per second\n",
                  nCount, (long)elapsed_seconds, nCountPerSecond);
      }
  }
  /*
   * Finds the first occurrence of c in the first len bytes of s.
   *
   * The search is bounded by len only; a NUL byte does not stop it.
   * Returns a pointer to the match, or NULL when c does not occur or
   * len <= 0.
   */
  char *strnchr(char *s, char c, int len)
  {
      if (len <= 0)
          return NULL;
      return memchr(s, c, (size_t)len);
  }
  /*
   * Finds the first occurrence of the NUL-terminated string s2 inside the
   * first len bytes of s1.
   *
   * The haystack is bounded by len only; it need not be NUL-terminated.
   * An empty s2 matches at s1 when len >= 0.
   * Returns a pointer to the match, or NULL when there is none.
   */
  char *strnstr(char *s1, char *s2, int len)
  {
      int s2_len = (int)strlen(s2);

      /* Stop as soon as the remaining bytes cannot hold the whole needle. */
      for (; len >= s2_len; s1++, len--)
      {
          if (memcmp(s1, s2, (size_t)s2_len) == 0)
              return s1;
      }
      return NULL;
  }
  310. long srting_len_fit_width(int idxFont, long *xCurrent, int nWidthLimit, char *pContent, long lenContent)
  311. {
  312. int lenLastBreak;
  313. int lenCurrent;
  314. int bIsBreakBeforeChar;
  315. int bIsBreakAfterChar;
  316. int nCharBytes;
  317. int xCurrentLocal;
  318. int bNewLine = 0;
  319. if (*xCurrent <= render_buf.current_indent)
  320. bNewLine = 1;
  321. lenLastBreak = 0;
  322. lenCurrent = 0;
  323. xCurrentLocal = (int)*xCurrent;
  324. if (xCurrentLocal < 0)
  325. xCurrentLocal = 0;
  326. while (xCurrentLocal < nWidthLimit && lenContent > 0 && *pContent != '\n')
  327. {
  328. bIsBreakBeforeChar = strchr(word_break_before_chars, *pContent) ||
  329. (*pContent & WORD_BREAK_FLEX_CHAR_MASK1) ||
  330. (*pContent & WORD_BREAK_FLEX_CHAR_MASK2) ||
  331. (*pContent & WORD_BREAK_FLEX_CHAR_MASK3);
  332. bIsBreakAfterChar = strchr(word_break_after_chars, *pContent) ||
  333. (*pContent & WORD_BREAK_FLEX_CHAR_MASK1) ||
  334. (*pContent & WORD_BREAK_FLEX_CHAR_MASK2) ||
  335. (*pContent & WORD_BREAK_FLEX_CHAR_MASK3);
  336. if (bIsBreakBeforeChar)
  337. {
  338. lenLastBreak = lenCurrent;
  339. *xCurrent = xCurrentLocal;
  340. }
  341. xCurrentLocal += get_UTF8_char_width(idxFont, &pContent, &lenContent, &nCharBytes);
  342. if (xCurrentLocal < nWidthLimit)
  343. {
  344. lenCurrent += nCharBytes;
  345. if (bIsBreakAfterChar)
  346. {
  347. lenLastBreak = lenCurrent;
  348. *xCurrent = xCurrentLocal;
  349. }
  350. }
  351. else
  352. {
  353. if (!lenLastBreak)
  354. {
  355. if (!bNewLine)
  356. return 0; // if not beginning of line, move to the next line
  357. lenLastBreak = lenCurrent;
  358. *xCurrent = xCurrentLocal;
  359. // if no last break point, just return lenCurrent
  360. }
  361. else
  362. {
  363. lenContent = lenLastBreak;
  364. }
  365. }
  366. }
  367. if (!lenContent || !lenLastBreak) // if all string is comsumed or no break point
  368. {
  369. *xCurrent = xCurrentLocal;
  370. return lenCurrent;
  371. }
  372. else
  373. return lenLastBreak;
  374. }
  375. void render_title(char *pTitle, long nTitleLen)
  376. {
  377. long nTitleLenFitWidth;
  378. int nLineHeight;
  379. int i;
  380. for (i=0; i < nTitleLen; i++)
  381. {
  382. if (pTitle[i] == '~')
  383. pTitle[i] = ':';
  384. else if (pTitle[i] == '_')
  385. pTitle[i] = ' ';
  386. }
  387. nLineHeight = pcfFonts[TITLE_FONT_IDX - 1].Fmetrics.linespace;
  388. render_buf.nLines++;
  389. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  390. render_buf.nCurrentLineIdx = 0;
  391. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_0_SPACE_LINE;
  392. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = 5;
  393. render_buf.current_y += 5;
  394. while (nTitleLen > 0)
  395. {
  396. render_buf.nLines++;
  397. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  398. render_buf.nCurrentLineIdx = 0;
  399. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_3_NEW_LINE_WITH_FONT;
  400. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)(TITLE_FONT_IDX | (nLineHeight << 3));
  401. nTitleLenFitWidth = srting_len_fit_width(TITLE_FONT_IDX, &render_buf.current_x, LCD_BUF_WIDTH_PIXELS - LCD_LEFT_MARGIN, pTitle, nTitleLen);
  402. strncpy(&render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] , pTitle, nTitleLenFitWidth);
  403. render_buf.nCurrentLineIdx += nTitleLenFitWidth;
  404. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  405. nTitleLen -= nTitleLenFitWidth;
  406. pTitle += nTitleLenFitWidth;
  407. render_buf.current_y += nLineHeight;
  408. render_buf.current_x = -1;
  409. }
  410. render_buf.nLines++;
  411. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  412. render_buf.nCurrentLineIdx = 0;
  413. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_0_SPACE_LINE;
  414. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = 6;
  415. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_12_FULL_HORIZONTAL_LINE;
  416. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_0_SPACE_LINE;
  417. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = 4;
  418. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  419. render_buf.current_y += 7;
  420. render_buf.current_x = -1;
  421. render_buf.nCurrentLineIdx = 0;
  422. render_buf.nLines++; // starting with a new line
  423. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  424. }
  425. void get_tagend_str(char *pTagBuf, char **pText, int *lenText)
  426. {
  427. char c;
  428. int bDone = 0;
  429. if (*lenText > 0)
  430. {
  431. (*pText)++; // skip <
  432. (*lenText)--;
  433. *pTagBuf++ = '<';
  434. *pTagBuf++ = '/';
  435. }
  436. while (!bDone && *lenText > 0)
  437. {
  438. c = **pText;
  439. if (c == '>' || c == ' ' || c == '\r' || c == '\n' || c == '\t')
  440. {
  441. bDone = 1;
  442. }
  443. else
  444. {
  445. *pTagBuf++ = c;
  446. (*pText)++;
  447. (*lenText)--;
  448. }
  449. }
  450. *pTagBuf++ = '>';
  451. *pTagBuf = '\0';
  452. }
  453. int find_next_tag(char *pText, int lenText, int *lenBeforeTag, char **pTagDesc, int *lenTagDesc,
  454. char **pTagContent, int *lenTagContent, char **pAfterTag, int *lenAfterTag)
  455. {
  456. int i;
  457. int bFound;
  458. int idxTag;
  459. char *p;
  460. char *pTagEnd;
  461. char bufTagEnd[MAX_TAG_LEN];
  462. bFound = 0;
  463. idxTag = -1;
  464. *lenBeforeTag = 0;
  465. *lenTagDesc = 0;
  466. *lenTagContent = 0;
  467. *lenAfterTag = 0;
  468. while (lenText > 0 && !bFound)
  469. {
  470. p = strnchr(pText, '<', lenText);
  471. while (p && lenText > 1 && (p[1] == ' ' || p[1] == '\n'))
  472. p = strnchr(p + 1, '<', lenText - (p - pText) - 1);
  473. if (p)
  474. {
  475. *lenBeforeTag = p - pText;
  476. lenText -= *lenBeforeTag;
  477. pText = p;
  478. for (i=0; i < MAX_TAG_PAIRS && !bFound; i++)
  479. {
  480. if (!strncmp(pText, tag_pairs[i].sTagStart, tag_pairs[i].lenTagStart) &&
  481. (pText[tag_pairs[i].lenTagStart] == '>' || pText[tag_pairs[i].lenTagStart] == ' ' ||
  482. pText[tag_pairs[i].lenTagStart] == '\r' || pText[tag_pairs[i].lenTagStart] == '\n' ||
  483. pText[tag_pairs[i].lenTagStart] == '\t' || pText[tag_pairs[i].lenTagStart] == '/'))
  484. {
  485. bFound = 1;
  486. idxTag = i;
  487. pText += tag_pairs[i].lenTagStart;
  488. lenText -= tag_pairs[i].lenTagStart;
  489. if (tag_pairs[idxTag].lenTagEnd)
  490. pTagEnd = tag_pairs[idxTag].sTagEnd;
  491. else
  492. {
  493. bufTagEnd[0] = '\0';
  494. pTagEnd = bufTagEnd;
  495. }
  496. }
  497. }
  498. if (!bFound)
  499. {
  500. bFound = 1;
  501. idxTag = TAG_PAIR_UNKNOWN;
  502. if (*(pText + 1) == '/')
  503. bufTagEnd[0] = '\0';
  504. else
  505. get_tagend_str(bufTagEnd, &pText, &lenText); // build end tag and move pText to the end of start tag
  506. pTagEnd = bufTagEnd;
  507. }
  508. p = strnchr(pText, '>', lenText);
  509. if (p)
  510. {
  511. if (*(p - 1) == '/')
  512. {
  513. bufTagEnd[0] = '\0';
  514. pTagEnd = bufTagEnd;
  515. }
  516. *pTagDesc = pText + 1;
  517. *lenTagDesc = p - *pTagDesc;
  518. if (*pTagEnd)
  519. {
  520. *pTagContent = p + 1;
  521. lenText -= *pTagContent - pText;
  522. p = strnstr(*pTagContent, pTagEnd, lenText);
  523. if (p)
  524. {
  525. *lenTagContent = p - *pTagContent;
  526. pText = p + strlen(pTagEnd);
  527. lenText -= p + strnlen(pTagEnd) - *pTagContent;
  528. }
  529. else
  530. {
  531. *lenTagContent = lenText;
  532. lenText = 0;
  533. }
  534. }
  535. else
  536. {
  537. lenText -= p + 1 - pText;
  538. pText = p + 1;
  539. }
  540. }
  541. else
  542. {
  543. *lenBeforeTag += lenText;
  544. lenText = 0;
  545. }
  546. }
  547. else
  548. {
  549. *lenBeforeTag += lenText;
  550. lenText = 0;
  551. }
  552. }
  553. *lenAfterTag = lenText;
  554. *pAfterTag = pText;
  555. return idxTag;
  556. }
  557. int build_child_tree(int idxPreviousNode, int idxMyTag, char *pTagDesc, int lenTagDesc, char *pText, int lenText)
  558. {
  559. int idxChildTag;
  560. int idxMyNode;
  561. int idxChildNode;
  562. int idxPreviousChild;
  563. int lenBeforeTag;
  564. char *pTagContent;
  565. int lenTagContent;
  566. char *pAfterTag;
  567. int lenAfterTag;
  568. idxMyNode = nArticleNodeCount++;
  569. if (idxMyNode >= MAX_ARTICLE_NODES)
  570. {
  571. showMsg(0, "Max article nodes reached\n");
  572. exit(-1);
  573. }
  574. article_nodes[idxMyNode].idxTag = idxMyTag;
  575. article_nodes[idxMyNode].pTagDesc = pTagDesc;
  576. article_nodes[idxMyNode].lenTagDesc = lenTagDesc;
  577. article_nodes[idxMyNode].pContent = pText;
  578. article_nodes[idxMyNode].len = 0;
  579. article_nodes[idxMyNode].idxChildNode = 0;
  580. article_nodes[idxMyNode].idxNextNode = 0;
  581. if (idxPreviousNode > 0)
  582. {
  583. article_nodes[idxPreviousNode].idxNextNode = idxMyNode;
  584. }
  585. if (tag_pairs[idxMyTag].bGotChild)
  586. {
  587. idxPreviousChild = 0;
  588. idxChildTag = find_next_tag(pText, lenText, &lenBeforeTag, &pTagDesc, &lenTagDesc, &pTagContent, &lenTagContent, &pAfterTag, &lenAfterTag);
  589. article_nodes[idxMyNode].len = lenBeforeTag;
  590. while (idxChildTag >= 0)
  591. {
  592. idxChildNode = build_child_tree(idxPreviousChild, idxChildTag, pTagDesc, lenTagDesc, pTagContent, lenTagContent);
  593. if (!article_nodes[idxMyNode].idxChildNode)
  594. {
  595. article_nodes[idxMyNode].idxChildNode = idxChildNode;
  596. }
  597. idxPreviousChild = idxChildNode;
  598. pText = pAfterTag;
  599. lenText = lenAfterTag;
  600. idxChildTag = find_next_tag(pText, lenText, &lenBeforeTag, &pTagDesc, &lenTagDesc, &pTagContent, &lenTagContent, &pAfterTag, &lenAfterTag);
  601. if (lenBeforeTag > 0)
  602. {
  603. article_nodes[nArticleNodeCount].idxTag = -1;
  604. article_nodes[nArticleNodeCount].pTagDesc = NULL;
  605. article_nodes[nArticleNodeCount].lenTagDesc = 0;
  606. article_nodes[nArticleNodeCount].pContent = pText;
  607. article_nodes[nArticleNodeCount].len = lenBeforeTag;
  608. article_nodes[nArticleNodeCount].idxChildNode = 0;
  609. article_nodes[nArticleNodeCount].idxNextNode = 0;
  610. if (idxPreviousChild)
  611. {
  612. article_nodes[idxPreviousChild].idxNextNode = nArticleNodeCount;
  613. }
  614. else
  615. article_nodes[idxMyNode].idxNextNode = nArticleNodeCount;
  616. idxPreviousChild = nArticleNodeCount;
  617. nArticleNodeCount++;
  618. }
  619. }
  620. }
  621. else
  622. {
  623. article_nodes[idxMyNode].len = lenText;
  624. }
  625. return idxMyNode;
  626. }
  627. struct ampersand_char
  628. {
  629. char *pIn;
  630. char *pOut;
  631. } ampersand_chars[] = {
  632. {"", ""}
  633. };
  634. long replace_ampersand_char_crlf(char *pDest, char *pSrc)
  635. {
  636. char *pSemicolon;
  637. ucs4_t u;
  638. char sUnicode[12];
  639. long len = 0;
  640. char *p = pSrc;
  641. while (*pSrc)
  642. {
  643. if (!strncmp(pSrc, "&gt;", 4))
  644. {
  645. *pDest++ = '>';
  646. len++;
  647. pSrc += 4;
  648. }
  649. else if (!strncmp(pSrc, "&lt;", 4))
  650. {
  651. *pDest++ = '<';
  652. len++;
  653. pSrc += 4;
  654. }
  655. else if (!strncmp(pSrc, "&quot;", 6))
  656. {
  657. *pDest++ = '"';
  658. len++;
  659. pSrc += 6;
  660. }
  661. else if (!strncmp(pSrc, "&amp;nbsp;", 10))
  662. {
  663. *pDest++ = ' ';
  664. len++;
  665. pSrc += 10;
  666. }
  667. else if (!strncmp(pSrc, "&amp;times;", 11))
  668. {
  669. *pDest++ = ' ';
  670. len++;
  671. pSrc += 11;
  672. }
  673. else if (!strncmp(pSrc, "&amp;", 5))
  674. {
  675. *pDest++ = '&';
  676. len++;
  677. pSrc += 5;
  678. }
  679. else if (!strncmp(pSrc, "&#", 2))
  680. {
  681. pSemicolon = strchr(pSrc, ';');
  682. if (pSemicolon && pSemicolon - (pSrc + 2) < sizeof(sUnicode))
  683. {
  684. strncpy(sUnicode, pSrc + 2, pSemicolon - (pSrc + 2));
  685. sUnicode[pSemicolon - (pSrc + 2)] = '\0';
  686. u = atol(sUnicode);
  687. UCS4_to_UTF8(u, sUnicode);
  688. strcpy(pDest, sUnicode);
  689. pDest += strlen(sUnicode);
  690. len += strlen(sUnicode);
  691. pSrc = pSemicolon + 1;
  692. }
  693. else
  694. {
  695. pSrc++;
  696. pDest++;
  697. len++;
  698. }
  699. }
  700. else if (!memcmp(pSrc, "\n\n", 2))
  701. {
  702. strncpy(pDest, "<br>", 4);
  703. pDest += 4;
  704. len += 4;
  705. pSrc += 2;
  706. }
  707. else if (*pSrc == '\n')
  708. {
  709. if (len > 0 && *(pDest - 1) != ' ' && *(pSrc+1) != ' ')
  710. {
  711. *pDest++ = ' ';
  712. len++;
  713. }
  714. pSrc++;
  715. }
  716. else if (*pSrc == '\r')
  717. pSrc++;
  718. else
  719. {
  720. *pDest++ = *pSrc++;
  721. len++;
  722. }
  723. }
  724. *pDest = '\0';
  725. return len;
  726. }
  727. void memrcpy(void *dest, void *src, int len) // memory copy starting from the last byte
  728. {
  729. char *d = (char*)dest;
  730. char *s = (char*)src;
  731. if (len >= 0)
  732. {
  733. d += len - 1;
  734. s += len - 1;
  735. while (len--)
  736. {
  737. *d = *s;
  738. d--;
  739. s--;
  740. }
  741. }
  742. }
  743. #define ALL_LANGUAGES "ar:als:an:ast:bn:bs:cy:zh-min-nan:be-x-old:br:bg:ca:cs:da:pdc:de:et:el:es:eo:eu:fa:fo:fr:fy:gd:gl:ko:hi:hr:id:is:it:he:jv:ka:kk:sw:la:lt:lv:hu:mk:arz:ms:mn:nl:ja:no:nn:oc:pl:pt:ro:ru:sah:sco:simple:sk:sl:sr:sh:fi:sv:tl:ta:th:tr:uk:ur:ug:vi:war:yi:zh-yue:diq:bat-smg:zh:"
  744. #define MAX_LANGUAGE_STRING 16
  745. int unsupported_article(char *pHtmlFile)
  746. {
  747. if (strstr(pHtmlFile, " talk:") ||
  748. !strncmp(pHtmlFile, "Talk:", 5) || !strncmp(pHtmlFile, "User:", 5) ||
  749. !strncmp(pHtmlFile, "Wikipedia:", 10) || !strncmp(pHtmlFile, "Category:", 9) ||
  750. !strncmp(pHtmlFile, "File:", 5) || !strncmp(pHtmlFile, "Template:", 9) ||
  751. !strncmp(pHtmlFile, "Portal:", 7) || !strncmp(pHtmlFile, "Image:", 6) ||
  752. !strncmp(pHtmlFile, "MediaWiki:", 10))
  753. {
  754. return 1;
  755. }
  756. else
  757. {
  758. char s[MAX_LANGUAGE_STRING];
  759. char *p;
  760. p = strchr(pHtmlFile, ':');
  761. if (p && p - pHtmlFile < MAX_LANGUAGE_STRING - 1)
  762. {
  763. memcpy(s, pHtmlFile, p - pHtmlFile + 1);
  764. s[p - pHtmlFile] = '\0';
  765. if (strstr(ALL_LANGUAGES, s))
  766. return 1;
  767. }
  768. }
  769. return 0;
  770. }
  771. #define MAX_FOLDER_DEPTH 4
  772. #define MAX_SUBFOLDERS 200000
  773. #define MAX_FOLDER_NAME 512
  774. int get_subfolders_and_files(char *dir, int nDepth, char *folders[MAX_FOLDER_DEPTH][MAX_SUBFOLDERS],
  775. int folder_types[MAX_FOLDER_DEPTH][MAX_SUBFOLDERS])
  776. {
  777. int count = 0;
  778. struct dirent *de = NULL;
  779. DIR *d = NULL;
  780. if (nDepth >= MAX_FOLDER_DEPTH)
  781. {
  782. showMsg(0, "exceed maximum depth %s\n", dir);
  783. exit(-1);
  784. }
  785. d = opendir(dir);
  786. if(d)
  787. {
  788. while ((de = readdir(d)) != NULL)
  789. {
  790. if (de->d_name[0] != '.')
  791. {
  792. if (nDepth >= MAX_FOLDER_DEPTH - 1)
  793. de->d_type = DT_REG; // NFS does not return the correct type sometimes
  794. if (de->d_type != DT_REG || !strstr(de->d_name, ".html") || !unsupported_article(de->d_name))
  795. {
  796. if (count >= MAX_SUBFOLDERS)
  797. {
  798. showMsg(0, "too many files in folder %s\n", dir);
  799. exit(-1);
  800. }
  801. sprintf(folders[nDepth][count], "%s/%s", dir, de->d_name);
  802. folder_types[nDepth][count] = de->d_type;
  803. count++;
  804. }
  805. }
  806. }
  807. closedir(d);
  808. }
  809. return count;
  810. }
  811. char *folders[MAX_FOLDER_DEPTH][MAX_SUBFOLDERS];
  812. char *folder_names;
  813. int folder_types[MAX_FOLDER_DEPTH][MAX_SUBFOLDERS];
  814. int folder_counts[MAX_FOLDER_DEPTH];
  815. int folder_indexes[MAX_FOLDER_DEPTH];
  816. int current_folder_depth = -1;
  817. char *next_html_files(char *dir)
  818. {
  819. static char file_name[MAX_FOLDER_NAME];
  820. int bDone = 0;
  821. file_name[0] = '\0';
  822. if (current_folder_depth < 0)
  823. {
  824. int i, j;
  825. folder_names = malloc(MAX_FOLDER_DEPTH * MAX_SUBFOLDERS * MAX_FOLDER_NAME);
  826. if (!folder_names)
  827. {
  828. showMsg(0, "malloc folder_names error\n");
  829. exit(-1);
  830. }
  831. for (i = 0; i < MAX_FOLDER_DEPTH; i++)
  832. for (j = 0; j < MAX_SUBFOLDERS; j++)
  833. folders[i][j] = folder_names + (i * MAX_SUBFOLDERS + j) * MAX_FOLDER_NAME;
  834. current_folder_depth++;
  835. folder_counts[current_folder_depth] = get_subfolders_and_files(dir, current_folder_depth, folders, folder_types);
  836. if (folder_counts[current_folder_depth] == 0)
  837. bDone = 1;
  838. else
  839. {
  840. folder_indexes[current_folder_depth] = 0;
  841. }
  842. }
  843. if (!bDone && folder_indexes[current_folder_depth] < folder_counts[current_folder_depth])
  844. {
  845. int i = folder_indexes[current_folder_depth];
  846. if (folder_types[current_folder_depth][i] == DT_DIR || folder_types[current_folder_depth][i] == DT_UNKNOWN)
  847. {
  848. current_folder_depth++;
  849. folder_counts[current_folder_depth] = get_subfolders_and_files(folders[current_folder_depth-1][i],
  850. current_folder_depth, folders, folder_types);
  851. if (folder_counts[current_folder_depth] == 0)
  852. {
  853. current_folder_depth--;
  854. folder_indexes[current_folder_depth]++;
  855. return next_html_files(folders[current_folder_depth][folder_indexes[current_folder_depth]]);
  856. }
  857. folder_indexes[current_folder_depth] = 0;
  858. next_html_files(folders[current_folder_depth][0]);
  859. if (file_name[0])
  860. bDone = 1;
  861. }
  862. else if (folder_types[current_folder_depth][i] == DT_REG)
  863. {
  864. strcpy(file_name, folders[current_folder_depth][i]);
  865. folder_indexes[current_folder_depth]++;
  866. bDone = 1;
  867. }
  868. }
  869. // if not done, go to the parent folder's next folder
  870. if (!bDone)
  871. {
  872. current_folder_depth--;
  873. if (current_folder_depth < 0)
  874. return file_name;
  875. else
  876. {
  877. folder_indexes[current_folder_depth]++;
  878. return next_html_files(folders[current_folder_depth][folder_indexes[current_folder_depth]]);
  879. }
  880. }
  881. return file_name;
  882. }
  883. void get_file_name_from_path(char *sFile, char *sPath)
  884. {
  885. char *pLastSlash;
  886. char *pLastDot;
  887. int len;
  888. sFile[0] = '\0';
  889. len = 0;
  890. pLastSlash = strrchr(sPath, '/');
  891. if (pLastSlash)
  892. {
  893. pLastDot = strrchr(pLastSlash, '.');
  894. if (pLastDot)
  895. {
  896. len = pLastDot - pLastSlash - 1;
  897. sFile = strncpy(sFile, pLastSlash + 1, len);
  898. sFile[len] = '\0';
  899. }
  900. else
  901. {
  902. strcpy(sFile, pLastSlash + 1);
  903. len = strlen(sFile);
  904. }
  905. }
  906. // truncation the trailing seq number in the format of _xxxx where x is the hex number
  907. if (len > 5 && sFile[len-5] == '_' &&
  908. (('0' <= sFile[len-4] && sFile[len-4] <= '9') || ('a' <= sFile[len-4] && sFile[len-4] <= 'f')) &&
  909. (('0' <= sFile[len-3] && sFile[len-3] <= '9') || ('a' <= sFile[len-3] && sFile[len-3] <= 'f')) &&
  910. (('0' <= sFile[len-2] && sFile[len-2] <= '9') || ('a' <= sFile[len-2] && sFile[len-2] <= 'f')) &&
  911. (('0' <= sFile[len-1] && sFile[len-1] <= '9') || ('a' <= sFile[len-1] && sFile[len-1] <= 'f')))
  912. sFile[len-5] = '\0';
  913. }
/*
 * Derive an article's title from its HTML file path and build the search
 * keys for it.
 *
 * pHtmlFile       - path of the article's HTML file
 * sTitle          - receives the name extracted by get_file_name_from_path()
 * sTitleSearch,
 * sFirstTwoChars,
 * sSecondTwoChars - filled in by build_title_search() from sTitle
 */
void get_title(char *pHtmlFile, char *sTitle, char *sTitleSearch, char *sFirstTwoChars, char *sSecondTwoChars)
{
    get_file_name_from_path(sTitle, pHtmlFile);
    build_title_search(sTitle, sTitleSearch, sFirstTwoChars, sSecondTwoChars);
}
  919. void article_info(char *pHtmlFile, char *sRedirect, int *nType)
  920. {
  921. FILE *fd;
  922. char buf[1024];
  923. int len;
  924. char *p, *p2;
  925. *nType = -1;
  926. fd = fopen(pHtmlFile, "rb");
  927. if (!fd)
  928. {
  929. showMsg(0, "cannot open file %s, error: %s\n", pHtmlFile, strerror(errno));
  930. exit(-1);
  931. }
  932. len = fread(buf, 1, 1023, fd);
  933. buf[len] = '\0';
  934. sRedirect[0] = '\0';
  935. p = strstr(buf, "<p>Redirecting to <a href=");
  936. if (p)
  937. {
  938. p += 27; // skip <p>Redirecting to <a href="
  939. p2 = strchr(p, '"');
  940. if (p2)
  941. {
  942. *p2 = '\0';
  943. if (strstr(p, "../articles/"))
  944. {
  945. *nType = 1;
  946. get_file_name_from_path(sRedirect, p);
  947. url_decode(sRedirect);
  948. }
  949. }
  950. }
  951. else
  952. *nType = 0;
  953. if (fd)
  954. fclose(fd);
  955. }
  956. void render_newline(int space)
  957. {
  958. if (render_buf.current_x >= 0)
  959. {
  960. render_buf.current_y += render_buf.nCurrentLineHeight;
  961. render_buf.nCurrentLineIdx = 0;
  962. render_buf.nLines++;
  963. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  964. }
  965. if (space)
  966. {
  967. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_0_SPACE_LINE;
  968. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = space;
  969. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  970. render_buf.current_y += space;
  971. }
  972. render_buf.current_x = -1;
  973. render_buf.nCurrentLineIdx = 0;
  974. render_buf.nLines++;
  975. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  976. // render_buf.nCurrentLineHeight = pcfFonts[render_buf.nCurrentFontIdx - 1].Fmetrics.linespace;
  977. // if (render_buf.nCurrentFontIdx == DEFAULT_FONT_IDX)
  978. // {
  979. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  980. // (char)ESC_1_NEW_LINE_DEFAULT_FONT;
  981. // }
  982. // else
  983. // {
  984. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  985. // (char)ESC_3_NEW_LINE_WITH_FONT;
  986. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  987. // (char)(render_buf.nCurrentFontIdx | (render_buf.nCurrentLineHeight << 3));
  988. // }
  989. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  990. }
  991. #define SIZE_RENDER_STRING_BUF 32000
  992. void render_string(char *pContentCurrent, long lenToProcess, int nFontIdx, int nType, char *link_str)
  993. {
  994. int nTextLenFitWidth;
  995. char sBuf[SIZE_RENDER_STRING_BUF];
  996. char *pBuf;
  997. if (nFontIdx >= 0)
  998. render_buf.nCurrentFontIdx = nFontIdx;
  999. if (nType >= 0)
  1000. render_buf.nCurrentRenderType = nType;
  1001. if (lenToProcess >= SIZE_RENDER_STRING_BUF)
  1002. lenToProcess = SIZE_RENDER_STRING_BUF - 1;
  1003. strncpy(sBuf, pContentCurrent, lenToProcess);
  1004. sBuf[lenToProcess] = '\0';
  1005. pBuf = sBuf;
  1006. while (*pBuf)
  1007. {
  1008. showMsg(6, "[%c%c%c%c] - (%d, %d)\n", pBuf[0], pBuf[1], pBuf[2], pBuf[3], render_buf.current_x, render_buf.current_y);
  1009. if (render_buf.nCurrentRenderType == RENDER_TYPE_LINK && link_str && render_buf.nLinks < MAX_LINKS)
  1010. {
  1011. if (render_buf.current_x < 0)
  1012. links[render_buf.nLinks].start_x = 0;
  1013. else
  1014. links[render_buf.nLinks].start_x = render_buf.current_x;
  1015. if (links[render_buf.nLinks].start_x < render_buf.current_indent)
  1016. links[render_buf.nLinks].start_x = render_buf.current_indent;
  1017. links[render_buf.nLinks].start_y = render_buf.current_y;
  1018. links[render_buf.nLinks].bWikiLink = 0;
  1019. strncpy(links[render_buf.nLinks].sLink, link_str, MAX_LINK_STRING - 1);
  1020. links[render_buf.nLinks].sLink[MAX_LINK_STRING - 1] = '\0';
  1021. }
  1022. if (render_buf.current_x < 0) /* new line */
  1023. {
  1024. render_buf.current_x = 0;
  1025. if (*pBuf == ' ')
  1026. pBuf++;
  1027. render_buf.nCurrentLineHeight = pcfFonts[render_buf.nCurrentFontIdx - 1].Fmetrics.linespace + LINE_SPACE_ADDON;
  1028. if (render_buf.nCurrentFontIdx == DEFAULT_FONT_IDX)
  1029. {
  1030. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1031. (char)ESC_1_NEW_LINE_DEFAULT_FONT;
  1032. }
  1033. else
  1034. {
  1035. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1036. (char)ESC_3_NEW_LINE_WITH_FONT;
  1037. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1038. (char)(render_buf.nCurrentFontIdx | (render_buf.nCurrentLineHeight << 3));
  1039. }
  1040. }
  1041. if (render_buf.current_x < render_buf.current_indent)
  1042. {
  1043. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_7_FORWARD;
  1044. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)(render_buf.current_indent - render_buf.current_x);
  1045. render_buf.current_x = render_buf.current_indent;
  1046. }
  1047. nTextLenFitWidth = srting_len_fit_width(render_buf.nCurrentFontIdx, &render_buf.current_x, LCD_BUF_WIDTH_PIXELS - LCD_LEFT_MARGIN, pBuf, strlen(pBuf));
  1048. if (nTextLenFitWidth > 0)
  1049. {
  1050. strncpy(&render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx], pBuf, nTextLenFitWidth);
  1051. render_buf.nCurrentLineIdx += nTextLenFitWidth;
  1052. if (render_buf.nCurrentRenderType == RENDER_TYPE_LINK && link_str && render_buf.nLinks < MAX_LINKS)
  1053. {
  1054. if (render_buf.current_x - links[render_buf.nLinks].start_x > 0)
  1055. {
  1056. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = ESC_10_HORIZONTAL_LINE;
  1057. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = render_buf.current_x - links[render_buf.nLinks].start_x;
  1058. }
  1059. links[render_buf.nLinks].end_x = render_buf.current_x - 1;
  1060. links[render_buf.nLinks].end_y = render_buf.current_y + render_buf.nCurrentLineHeight - 1;
  1061. render_buf.nLinks++;
  1062. }
  1063. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  1064. pBuf += nTextLenFitWidth;
  1065. }
  1066. if (*pBuf)
  1067. {
  1068. render_buf.current_y += render_buf.nCurrentLineHeight;
  1069. render_buf.current_x = -1;
  1070. render_buf.nCurrentLineIdx = 0;
  1071. render_buf.nLines++;
  1072. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  1073. if (render_buf.nCurrentRenderType == RENDER_TYPE_LINK && link_str && render_buf.nLinks < MAX_LINKS)
  1074. {
  1075. if (render_buf.current_x < 0)
  1076. links[render_buf.nLinks].start_x = 0;
  1077. else
  1078. links[render_buf.nLinks].start_x = render_buf.current_x;
  1079. links[render_buf.nLinks].start_y = render_buf.current_y;
  1080. links[render_buf.nLinks].bWikiLink = 0;
  1081. strncpy(links[render_buf.nLinks].sLink, link_str, MAX_LINK_STRING - 1);
  1082. links[render_buf.nLinks].sLink[MAX_LINK_STRING - 1] = '\0';
  1083. }
  1084. // else if (render_buf.nCurrentRenderType == RENDER_TYPE_TABLE_DATA)
  1085. // {
  1086. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = ESC_12_FULL_HORIZONTAL_LINE;
  1087. // render_buf.current_y += 1;
  1088. // }
  1089. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  1090. }
  1091. }
  1092. }
  1093. void get_key_value(char *value, int val_len, char *key, char *str, int len)
  1094. {
  1095. char *p, *p2;
  1096. value[0] = '\0';
  1097. p = strnstr(str, key, len);
  1098. if (p)
  1099. {
  1100. p += strlen(key);
  1101. if (p[0] == '=' && (p[1] == '"' || p[1] =='\''))
  1102. {
  1103. p += 2;
  1104. if (p[1] == '"')
  1105. p2 = strnchr(p, '"', len - strlen(key) - 2);
  1106. else
  1107. p2 = strnchr(p, '\'', len - strlen(key) - 2);
  1108. if (p2)
  1109. {
  1110. int copy_len;
  1111. if (val_len - 1 > p2 - p)
  1112. copy_len = p2 - p;
  1113. else
  1114. copy_len = val_len - 1;
  1115. strncpy(value, p, copy_len);
  1116. value[copy_len] = '\0';
  1117. }
  1118. }
  1119. }
  1120. // url_decode(value);
  1121. }
  1122. void render_sub_title_node(int idxNode)
  1123. {
  1124. if (render_buf.current_x >= 0)
  1125. {
  1126. render_buf.current_y += render_buf.nCurrentLineHeight;
  1127. render_buf.current_x = -1;
  1128. render_buf.nCurrentLineIdx = 0;
  1129. render_buf.nLines++;
  1130. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  1131. }
  1132. switch (article_nodes[idxNode].idxTag)
  1133. {
  1134. case TAG_PAIR_H2:
  1135. render_text_node(idxNode, H2_FONT_IDX, RENDER_TYPE_NORMAL);
  1136. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_12_FULL_HORIZONTAL_LINE;
  1137. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_0_SPACE_LINE;
  1138. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = 2;
  1139. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  1140. // render_buf.current_y += 3;
  1141. break;
  1142. case TAG_PAIR_H3:
  1143. render_text_node(idxNode, H3_FONT_IDX, RENDER_TYPE_NORMAL);
  1144. break;
  1145. case TAG_PAIR_H4:
  1146. render_text_node(idxNode, H4_FONT_IDX, RENDER_TYPE_NORMAL);
  1147. break;
  1148. case TAG_PAIR_H5:
  1149. render_text_node(idxNode, H5_FONT_IDX, RENDER_TYPE_NORMAL);
  1150. break;
  1151. default:
  1152. render_text_node(idxNode, DEFAULT_FONT_IDX, RENDER_TYPE_NORMAL);
  1153. break;
  1154. }
  1155. render_buf.current_y += render_buf.nCurrentLineHeight;
  1156. render_buf.current_x = -1;
  1157. render_buf.nCurrentLineIdx = 0;
  1158. render_buf.nLines++;
  1159. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  1160. }
  1161. void dump_wiki_nodes()
  1162. {
  1163. int i;
  1164. int j;
  1165. int level = 0;
  1166. for (i = 0; i < nWikiNodeCount; i++)
  1167. {
  1168. printf("|node idx[%d] tag[%d] %d tag[",
  1169. i, wiki_nodes[i].idxTag, wiki_nodes[i].bTagStart);
  1170. for (j = 0; j < wiki_nodes[i].lenTag; j++)
  1171. printf("%c", wiki_nodes[i].pTag[j]);
  1172. printf("] len[%d]\n", wiki_nodes[i].lenTag);
  1173. }
  1174. }
  1175. void dump_article_node(int idxNode, int level)
  1176. {
  1177. int i;
  1178. for (i = 0; i < level; i++)
  1179. printf("-");
  1180. printf("|node idx[%d] child[%d] next[%d] tag[",
  1181. idxNode, article_nodes[idxNode].idxChildNode, article_nodes[idxNode].idxNextNode);
  1182. if (0 <= article_nodes[idxNode].idxTag && article_nodes[idxNode].idxTag < MAX_TAG_PAIRS)
  1183. printf("%s] desc[", tag_pairs[article_nodes[idxNode].idxTag].sTagStart);
  1184. else
  1185. printf("%d] desc[", article_nodes[idxNode].idxTag);
  1186. for (i=0; i < article_nodes[idxNode].lenTagDesc; i++)
  1187. printf("%c", article_nodes[idxNode].pTagDesc[i]);
  1188. printf("] len[%d]\n", article_nodes[idxNode].len);
  1189. for (i = 0; i < level; i++)
  1190. printf(" ");
  1191. printf("|{");
  1192. for (i=0; i< article_nodes[idxNode].len; i++)
  1193. printf("%c", article_nodes[idxNode].pContent[i]);
  1194. printf("}\n");
  1195. }
  1196. void dump_article_node_and_children(int idxNode, int level)
  1197. {
  1198. int childNode;
  1199. int nextNode;
  1200. dump_article_node(idxNode, level);
  1201. if (article_nodes[idxNode].idxChildNode > 0)
  1202. dump_article_node_and_children(article_nodes[idxNode].idxChildNode, level + 1);
  1203. if (article_nodes[idxNode].idxNextNode > 0)
  1204. dump_article_node_and_children(article_nodes[idxNode].idxNextNode, level);
  1205. }
  1206. void render_text_node(int idxNode, int nFontIdx, int nRenderType)
  1207. {
  1208. int nOrigFontIdx;
  1209. int nOrigRenderType;
  1210. int idxNextChildNode;
  1211. nOrigFontIdx = render_buf.nCurrentFontIdx;
  1212. nOrigRenderType = render_buf.nCurrentRenderType;
  1213. if (nFontIdx >= 0)
  1214. render_buf.nCurrentFontIdx = nFontIdx;
  1215. if (nRenderType >= 0)
  1216. render_buf.nCurrentRenderType = nRenderType;
  1217. if (article_nodes[idxNode].len > 0 && !all_blanks(article_nodes[idxNode].pContent, article_nodes[idxNode].len))
  1218. render_string(article_nodes[idxNode].pContent, article_nodes[idxNode].len,
  1219. render_buf.nCurrentFontIdx, render_buf.nCurrentRenderType, NULL);
  1220. idxNextChildNode = article_nodes[idxNode].idxChildNode;
  1221. if (idxNextChildNode)
  1222. {
  1223. render_article_node(idxNextChildNode);
  1224. // idxNextChildNode = article_nodes[idxNextChildNode].idxNextNode;
  1225. }
  1226. render_buf.nCurrentFontIdx = nOrigFontIdx;
  1227. render_buf.nCurrentRenderType = nOrigRenderType;
  1228. }
  1229. void render_LI(int idxNode)
  1230. {
  1231. int indent;
  1232. char *pForwardPixels;
  1233. char li_str[10];
  1234. int nDepth = 0;
  1235. int i;
  1236. if (render_buf.list_depth > 0)
  1237. indent = (render_buf.list_depth - 1) * PIXELS_PER_INDENT + PIXELS_FIRST_INDENT;
  1238. else
  1239. indent = 0;
  1240. if (indent > LCD_BUF_WIDTH_PIXELS / 3 * 2)
  1241. indent = LCD_BUF_WIDTH_PIXELS / 3 * 2;
  1242. if (render_buf.list_depth > 0 && render_buf.list_type[render_buf.list_depth - 1] == 'O')
  1243. {
  1244. render_buf.ol_count[render_buf.list_depth - 1]++;
  1245. for (i = 0; i < render_buf.list_depth; i++)
  1246. {
  1247. if (render_buf.list_type[i] == 'O')
  1248. nDepth++;
  1249. }
  1250. switch (nDepth)
  1251. {
  1252. case 1:
  1253. sprintf(li_str, "%d. ", render_buf.ol_count[render_buf.list_depth - 1]);
  1254. break;
  1255. case 2:
  1256. if (render_buf.ol_count[render_buf.list_depth - 1] > (26 * 27))
  1257. sprintf(li_str, "%c%c%c. ", 'A' + (render_buf.ol_count[render_buf.list_depth - 1] - 26 * 26 - 1) / (26 * 26),
  1258. 'A' + ((render_buf.ol_count[render_buf.list_depth - 1] - 27) % (26 * 26)) / 26,
  1259. 'A' + ((render_buf.ol_count[render_buf.list_depth - 1] - 1) % 26));
  1260. else if (render_buf.ol_count[render_buf.list_depth - 1] > 26)
  1261. sprintf(li_str, "%c%c. ", 'A' + (render_buf.ol_count[render_buf.list_depth - 1] - 27) / 26,
  1262. 'A' + ((render_buf.ol_count[render_buf.list_depth - 1] - 1)% 26));
  1263. else
  1264. sprintf(li_str, "%c. ", 'A' + render_buf.ol_count[render_buf.list_depth - 1] - 1);
  1265. break;
  1266. default:
  1267. if (render_buf.ol_count[render_buf.list_depth - 1] > 26 * 27)
  1268. sprintf(li_str, "%c%c%c. ", 'a' + (render_buf.ol_count[render_buf.list_depth - 1] - 26 * 26 - 1) / (26 * 26),
  1269. 'a' + ((render_buf.ol_count[render_buf.list_depth - 1] - 27) % 26 * 26) / 26,
  1270. 'a' + ((render_buf.ol_count[render_buf.list_depth - 1] - 1) % 26));
  1271. else if (render_buf.ol_count[render_buf.list_depth - 1] > 26)
  1272. sprintf(li_str, "%c%c. ", 'a' + (render_buf.ol_count[render_buf.list_depth - 1] - 27) / 26,
  1273. 'a' + ((render_buf.ol_count[render_buf.list_depth - 1] - 1) % 26));
  1274. else
  1275. sprintf(li_str, "%c. ", 'a' + render_buf.ol_count[render_buf.list_depth - 1] - 1);
  1276. break;
  1277. }
  1278. }
  1279. else
  1280. {
  1281. for (i = 0; i < render_buf.list_depth; i++)
  1282. if (render_buf.list_type[render_buf.list_depth - 1] == 'U')
  1283. nDepth++;
  1284. switch (nDepth)
  1285. {
  1286. case 1:
  1287. li_str[0] = 0xE2;
  1288. li_str[1] = 0x96;
  1289. li_str[2] = 0xAA;
  1290. li_str[3] = ' ';
  1291. li_str[4] = '\0';
  1292. break;
  1293. case 2:
  1294. li_str[0] = 0xE2;
  1295. li_str[1] = 0x80;
  1296. li_str[2] = 0xA2;
  1297. li_str[3] = ' ';
  1298. li_str[4] = '\0';
  1299. break;
  1300. default:
  1301. li_str[0] = 0xE2;
  1302. li_str[1] = 0x97;
  1303. li_str[2] = 0xA6;
  1304. li_str[3] = ' ';
  1305. li_str[4] = '\0';
  1306. break;
  1307. }
  1308. }
  1309. if (render_buf.current_x < 0) /* new line */
  1310. {
  1311. render_buf.current_x = 0;
  1312. render_buf.nCurrentLineHeight = pcfFonts[render_buf.nCurrentFontIdx - 1].Fmetrics.linespace + LINE_SPACE_ADDON;
  1313. if (render_buf.nCurrentFontIdx == DEFAULT_FONT_IDX)
  1314. {
  1315. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1316. (char)ESC_1_NEW_LINE_DEFAULT_FONT;
  1317. }
  1318. else
  1319. {
  1320. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1321. (char)ESC_3_NEW_LINE_WITH_FONT;
  1322. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1323. (char)(render_buf.nCurrentFontIdx | (render_buf.nCurrentLineHeight << 3));
  1324. }
  1325. }
  1326. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_7_FORWARD;
  1327. pForwardPixels = &(render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx]);
  1328. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = 0;
  1329. render_buf.current_x = 0;
  1330. render_buf.current_indent = 0;
  1331. render_string(li_str, strlen(li_str), -1, -1, NULL);
  1332. if (render_buf.current_x < indent)
  1333. {
  1334. *pForwardPixels = (char)(indent - render_buf.current_x);
  1335. render_buf.current_x += indent - render_buf.current_x;
  1336. }
  1337. render_buf.current_indent = indent;
  1338. render_text_node(idxNode, -1, -1);
  1339. render_newline(0);
  1340. }
  1341. void render_DT(int idxNode)
  1342. {
  1343. int indent;
  1344. char *pForwardPixels;
  1345. if (render_buf.list_depth > 1)
  1346. indent = (render_buf.list_depth - 2) * PIXELS_PER_INDENT + PIXELS_FIRST_INDENT;
  1347. else
  1348. indent = 0;
  1349. if (indent > LCD_BUF_WIDTH_PIXELS / 3 * 2)
  1350. indent = LCD_BUF_WIDTH_PIXELS / 3 * 2;
  1351. if (render_buf.current_x < 0) /* new line */
  1352. {
  1353. render_buf.current_x = 0;
  1354. render_buf.nCurrentLineHeight = pcfFonts[render_buf.nCurrentFontIdx - 1].Fmetrics.linespace + LINE_SPACE_ADDON;
  1355. if (render_buf.nCurrentFontIdx == DEFAULT_FONT_IDX)
  1356. {
  1357. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1358. (char)ESC_1_NEW_LINE_DEFAULT_FONT;
  1359. }
  1360. else
  1361. {
  1362. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1363. (char)ESC_3_NEW_LINE_WITH_FONT;
  1364. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1365. (char)(render_buf.nCurrentFontIdx | (render_buf.nCurrentLineHeight << 3));
  1366. }
  1367. }
  1368. render_buf.current_indent = indent;
  1369. render_text_node(idxNode, -1, -1);
  1370. render_newline(0);
  1371. }
  1372. void render_DD(int idxNode)
  1373. {
  1374. int indent;
  1375. char *pForwardPixels;
  1376. if (render_buf.list_depth > 0)
  1377. indent = (render_buf.list_depth - 1) * PIXELS_PER_INDENT + PIXELS_FIRST_INDENT;
  1378. else
  1379. indent = 0;
  1380. if (indent > LCD_BUF_WIDTH_PIXELS / 3 * 2)
  1381. indent = LCD_BUF_WIDTH_PIXELS / 3 * 2;
  1382. if (render_buf.current_x < 0) /* new line */
  1383. {
  1384. render_buf.current_x = 0;
  1385. render_buf.nCurrentLineHeight = pcfFonts[render_buf.nCurrentFontIdx - 1].Fmetrics.linespace + LINE_SPACE_ADDON;
  1386. if (render_buf.nCurrentFontIdx == DEFAULT_FONT_IDX)
  1387. {
  1388. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1389. (char)ESC_1_NEW_LINE_DEFAULT_FONT;
  1390. }
  1391. else
  1392. {
  1393. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1394. (char)ESC_3_NEW_LINE_WITH_FONT;
  1395. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1396. (char)(render_buf.nCurrentFontIdx | (render_buf.nCurrentLineHeight << 3));
  1397. }
  1398. }
  1399. render_buf.current_indent = indent;
  1400. render_text_node(idxNode, -1, -1);
  1401. render_newline(0);
  1402. }
  1403. void render_node_with_font(int idxNode, int idxFont)
  1404. {
  1405. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1406. (char)ESC_4_CHANGE_FONT;
  1407. render_buf.nCurrentFontIdx = idxFont;
  1408. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1409. (char)(render_buf.nCurrentFontIdx);
  1410. render_text_node(idxNode, idxFont, -1);
  1411. render_buf.nCurrentFontIdx = DEFAULT_FONT_IDX;
  1412. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] =
  1413. (char)ESC_5_RESET_TO_DEFAULT_FONT;
  1414. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  1415. }
  1416. void render_table(int idxNode)
  1417. {
  1418. int idxNextChildNode;
  1419. if (render_buf.current_x >= 0)
  1420. {
  1421. render_buf.current_y += render_buf.nCurrentLineHeight;
  1422. render_buf.current_x = -1;
  1423. render_buf.nCurrentLineIdx = 0;
  1424. render_buf.nLines++;
  1425. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  1426. }
  1427. if (article_nodes[idxNode].idxTag == TAG_PAIR_TABLE)
  1428. {
  1429. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_0_SPACE_LINE;
  1430. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = 8;
  1431. render_buf.current_y += 8;
  1432. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  1433. }
  1434. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_12_FULL_HORIZONTAL_LINE;
  1435. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  1436. // render_buf.current_y += 1;
  1437. if (article_nodes[idxNode].idxTag == TAG_PAIR_TR)
  1438. {
  1439. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_1_NEW_LINE_DEFAULT_FONT;
  1440. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  1441. render_buf.nCurrentFontIdx = DEFAULT_FONT_IDX;
  1442. render_buf.current_x = 0;
  1443. }
  1444. else
  1445. render_buf.current_x = -1;
  1446. render_buf.nCurrentLineIdx = 0;
  1447. render_buf.nCurrentLineHeight = pcfFonts[render_buf.nCurrentFontIdx - 1].Fmetrics.linespace + LINE_SPACE_ADDON;
  1448. render_buf.nLines++;
  1449. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  1450. idxNextChildNode = article_nodes[idxNode].idxChildNode;
  1451. if (idxNextChildNode)
  1452. {
  1453. render_article_node(idxNextChildNode);
  1454. }
  1455. if (render_buf.current_x >= 0)
  1456. {
  1457. render_buf.current_y += render_buf.nCurrentLineHeight;
  1458. render_buf.current_x = -1;
  1459. render_buf.nCurrentLineIdx = 0;
  1460. render_buf.nLines++;
  1461. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  1462. }
  1463. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_12_FULL_HORIZONTAL_LINE;
  1464. // render_buf.current_y += 1;
  1465. // if (article_nodes[idxNode].idxTag == TAG_PAIR_TABLE)
  1466. // {
  1467. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_0_SPACE_LINE;
  1468. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = 8;
  1469. // render_buf.current_y += 8;
  1470. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  1471. // }
  1472. render_buf.current_x = -1;
  1473. render_buf.nCurrentLineIdx = 0;
  1474. render_buf.nCurrentLineHeight = pcfFonts[render_buf.nCurrentFontIdx - 1].Fmetrics.linespace + LINE_SPACE_ADDON;
  1475. render_buf.nLines++;
  1476. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  1477. }
  /*
   * Render a table row (<tr>) node.  Rows share their implementation with
   * tables: render_table() checks the node's tag for TAG_PAIR_TR itself.
   */
  void render_TR(int idxNode)
  {
    render_table(idxNode);
  }
  1482. void render_TD(int idxNode)
  1483. {
  1484. // if (render_buf.current_x > 0 && render_buf.current_x % TABLE_CELL_WIDTH) // cell not aligned to the cell width
  1485. // {
  1486. // int nForwardPixels = TABLE_CELL_WIDTH - (render_buf.current_x % TABLE_CELL_WIDTH);
  1487. // if (render_buf.current_x + nForwardPixels >= LCD_BUF_WIDTH_PIXELS)
  1488. // {
  1489. // render_buf.current_y += render_buf.nCurrentLineHeight;
  1490. // render_buf.current_x = -1;
  1491. // render_buf.nCurrentLineIdx = 0;
  1492. // render_buf.nLines++;
  1493. // render_buf_sLines[render_buf.nLines-1][0] = '\0';
  1494. // }
  1495. // else
  1496. // {
  1497. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_7_FORWARD;
  1498. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)(nForwardPixels);
  1499. // render_buf_sLines[render_buf.nLines-1][0] = '\0';
  1500. // render_buf.current_x += nForwardPixels;
  1501. // }
  1502. //
  1503. // }
  1504. render_text_node(idxNode, DEFAULT_FONT_IDX, RENDER_TYPE_TABLE_DATA);
  1505. if (article_nodes[idxNode].len)
  1506. render_string(" ", 2, DEFAULT_FONT_IDX, RENDER_TYPE_NORMAL, NULL);
  1507. // if (render_buf.current_x < LCD_BUF_WIDTH_PIXELS - 2)
  1508. // {
  1509. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_13_FULL_VERTICAL_LINE;
  1510. // render_buf.current_x += 4;
  1511. // }
  1512. // else
  1513. // {
  1514. // render_buf.current_x = 4;
  1515. // render_buf.nCurrentLineIdx = 0;
  1516. // render_buf.nLines++;
  1517. // render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_13_FULL_VERTICAL_LINE;
  1518. // }
  1519. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  1520. }
  /*
   * Placeholder for reference nodes: nothing is rendered.
   */
  void render_ref(int idxNode)
  {
    (void)idxNode; /* intentionally unused */
  }
  1524. void render_line_break(void)
  1525. {
  1526. if (render_buf.current_x >= 0)
  1527. {
  1528. render_buf.current_y += render_buf.nCurrentLineHeight;
  1529. render_buf.current_x = -1;
  1530. render_buf.nCurrentLineIdx = 0;
  1531. render_buf.nLines++;
  1532. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  1533. }
  1534. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = (char)ESC_0_SPACE_LINE;
  1535. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx++] = 12;
  1536. render_buf_sLines[render_buf.nLines-1][render_buf.nCurrentLineIdx] = '\0';
  1537. render_buf.current_y += 12;
  1538. render_buf.current_x = -1;
  1539. render_buf.nCurrentLineIdx = 0;
  1540. render_buf.nLines++;
  1541. memset(render_buf_sLines[render_buf.nLines-1], 0, 256);
  1542. }
  /*
   * Tell whether the first len bytes at p consist only of blanks.
   *
   * Blank, line feed (0x0A), carriage return (0x0D) and tab count as blank.
   * Returns 1 when every byte is blank (also when len <= 0), otherwise 0.
   */
  int all_blanks(char *p, int len)
  {
    int i;

    for (i = 0; i < len; i++)
    {
      switch (p[i])
      {
        case ' ':
        case 0x0A:
        case 0x0D:
        case '\t':
          break;
        default:
          return 0;
      }
    }
    return 1;
  }
  1558. extern MYSQL *g_conn;
  1559. void get_redirect_title(char *sTitle)
  1560. {
  1561. char sSQL[MAX_SQL_STR];
  1562. MYSQL_RES *res;
  1563. MYSQL_ROW row;
  1564. int rc;
  1565. sprintf(sSQL, "select redirect_title from entries where title='%s' and entry_type=1", sTitle);
  1566. sTitle[0] = '\0';
  1567. if (!(rc = mysql_query(g_conn, sSQL)))
  1568. {
  1569. res = mysql_use_result(g_conn);
  1570. if ((row = mysql_fetch_row(res)) != NULL)
  1571. {
  1572. strcpy(sTitle, row[0]);
  1573. }
  1574. mysql_free_result(res);
  1575. }
  1576. mysql_commit(g_conn);
  1577. }
  1578. void render_link(int idxNode)
  1579. {
  1580. char sClass[MAX_CLASS_STRING];
  1581. char sRef[MAX_FOLDER_NAME];
  1582. char sLink[MAX_LINK_STRING];
  1583. int i;
  1584. sLink[0] = '\0';
  1585. get_key_value(sRef, MAX_FOLDER_NAME, "href", article_nodes[idxNode].pTagDesc, article_nodes[idxNode].lenTagDesc);
  1586. get_key_value(sClass, MAX_CLASS_STRING, "class", article_nodes[idxNode].pTagDesc, article_nodes[idxNode].lenTagDesc);
  1587. get_key_value(sLink, MAX_LINK_STRING, "title",article_nodes[idxNode].pTagDesc, article_nodes[idxNode].lenTagDesc);
  1588. // for (i=0; i < strlen(sLink); i++)
  1589. // {
  1590. // if (sLink[i] == ':')
  1591. // sLink[i] = '~';
  1592. // else if (sLink[i] == ' ')
  1593. // sLink[i] = '_';
  1594. // }
  1595. if (!strcmp(sClass, "mw-redirect"))
  1596. {
  1597. get_redirect_title(sLink);
  1598. }
  1599. else if (!sLink[0] && !strncmp(sRef, "../../../../articles/", 21))
  1600. {
  1601. get_file_name_from_path(sLink, sRef);
  1602. }
  1603. else if (!strcmp(sClass, "external autonumber") || !strncmp(sRef, "#cite_note", 10) || !strncmp(sRef, "internal", 8))
  1604. return; // skip external reference or citation
  1605. if (article_nodes[idxNode].len > 0 && strncmp(article_nodes[idxNode].pContent, "File:", 5) && strncmp(article_nodes[idxNode].pContent, "http:", 5))
  1606. {
  1607. if (!strncmp(sLink, "http:", 5) || unsupported_article(sLink))
  1608. sLink[0] = '\0';
  1609. if (sLink[0])
  1610. {
  1611. render_string(article_nodes[idxNode].pContent, article_nodes[idxNode].len, -1, RENDER_TYPE_LINK, sLink);
  1612. }
  1613. // else
  1614. // {
  1615. // render_string(article_nodes[idxNode].pContent, article_nodes[idxNode].len, -1, RENDER_TYPE_NORMAL, NULL);
  1616. // }
  1617. }
  1618. }
  1619. void render_article_node(int idxNode)
  1620. {
  1621. char keyval[256];
  1622. showMsg(3, "render idxNode %d, idxTag %d, len %d, current_y %ld\n", idxNode, article_nodes[idxNode].idxTag, article_nodes[idxNode].len, render_buf.current_y);
  1623. switch (article_nodes[idxNode].idxTag)
  1624. {
  1625. case TAG_PAIR_P:
  1626. get_key_value(keyval, sizeof(keyval), "class", article_nodes[idxNode].pTagDesc, article_nodes[idxNode].lenTagDesc);
  1627. // skip if for error message
  1628. if (strcmp(keyval, "error"))
  1629. {
  1630. //render_line_break(idxNode);
  1631. render_newline(12);
  1632. render_text_node(idxNode, DEFAULT_FONT_IDX, RENDER_TYPE_NORMAL);
  1633. }
  1634. break;
  1635. case TAG_PAIR_BR:
  1636. case TAG_PAIR_BR2:
  1637. case TAG_PAIR_P_END:
  1638. render_line_break();
  1639. break;
  1640. case TAG_PAIR_SPAN:
  1641. get_key_value(keyval, sizeof(keyval), "class", article_nodes[idxNode].pTagDesc, article_nodes[idxNode].lenTagDesc);
  1642. // skip if for [edit]
  1643. if (strcmp(keyval, "editsection"))
  1644. {
  1645. render_text_node(idxNode, -1, -1);
  1646. }
  1647. break;
  1648. case TAG_PAIR_H1:
  1649. case TAG_PAIR_H2:
  1650. case TAG_PAIR_H3:
  1651. case TAG_PAIR_H4:
  1652. case TAG_PAIR_H5:
  1653. case TAG_PAIR_H6:
  1654. render_sub_title_node(idxNode);
  1655. break;
  1656. case TAG_PAIR_A:
  1657. render_link(idxNode);
  1658. break;
  1659. case TAG_PAIR_BIG_BIG:
  1660. case TAG_PAIR_BIG:
  1661. render_node_with_font(idxNode, DEFAULT_FONT_IDX);
  1662. break;
  1663. case TAG_PAIR_BI:
  1664. render_node_with_font(idxNode, ITALIC_FONT_IDX);
  1665. break;
  1666. case TAG_PAIR_B:
  1667. render_node_with_font(idxNode, DEFAULT_FONT_IDX);
  1668. break;
  1669. case TAG_PAIR_I:
  1670. render_node_with_font(idxNode, ITALIC_FONT_IDX);
  1671. break;
  1672. case TAG_PAIR_TABLE:
  1673. get_key_value(keyval, sizeof(keyval), "class", article_nodes[idxNode].pTagDesc, article_nodes[idxNode].lenTagDesc);
  1674. // skip if for table of contents
  1675. if (strcmp(keyval, "toc"))
  1676. {
  1677. render_table(idxNode);
  1678. }
  1679. break;
  1680. case TAG_PAIR_TR:
  1681. render_TR(idxNode);
  1682. break;
  1683. case TAG_PAIR_TH:
  1684. case TAG_PAIR_TD:
  1685. render_TD(idxNode);
  1686. break;
  1687. case TAG_PAIR_OL:
  1688. render_newline(2);
  1689. if (render_buf.list_depth < MAX_LIST_DEPTH - 1)
  1690. {
  1691. render_buf.list_type[render_buf.list_depth] = 'O';
  1692. render_buf.ol_count[render_buf.list_depth] = 0;
  1693. render_buf.list_depth++;
  1694. }
  1695. break;
  1696. case TAG_PAIR_UL:
  1697. render_newline(2);
  1698. if (render_buf.list_depth < MAX_LIST_DEPTH - 1)
  1699. {
  1700. render_buf.list_type[render_buf.list_depth] = 'U';
  1701. render_buf.ol_count[render_buf.list_depth] = 0;
  1702. render_buf.list_depth++;
  1703. }
  1704. break;
  1705. case TAG_PAIR_DL:
  1706. render_newline(2);
  1707. if (render_buf.list_depth < MAX_LIST_DEPTH - 1)
  1708. {
  1709. render_buf.list_type[render_buf.list_depth] = 'D';
  1710. render_buf.ol_count[render_buf.list_depth] = 0;
  1711. render_buf.list_depth++;
  1712. }
  1713. break;
  1714. case TAG_PAIR_EOL:
  1715. case TAG_PAIR_EUL:
  1716. case TAG_PAIR_EDL:
  1717. render_newline(2);
  1718. if (render_buf.list_depth > 0)
  1719. {
  1720. render_buf.list_depth--;
  1721. render_buf.current_indent = 0;
  1722. }
  1723. break;
  1724. case TAG_PAIR_LI:
  1725. get_key_value(keyval, sizeof(keyval), "class", article_nodes[idxNode].pTagDesc, article_nodes[idxNode].lenTagDesc);
  1726. // skip if for for contents
  1727. if (strncmp(keyval, "toclevel-", 9))
  1728. {
  1729. render_LI(idxNode);
  1730. }
  1731. break;
  1732. case TAG_PAIR_DT:
  1733. render_DT(idxNode);
  1734. break;
  1735. case TAG_PAIR_DD:
  1736. render_DD(idxNode);
  1737. break;
  1738. case TAG_PAIR_SCRIPT:
  1739. case TAG_PAIR_REF:
  1740. break;
  1741. default:
  1742. render_text_node(idxNode, DEFAULT_FONT_IDX, RENDER_TYPE_NORMAL);
  1743. break;
  1744. }
  1745. if (article_nodes[idxNode].idxNextNode)
  1746. render_article_node(article_nodes[idxNode].idxNextNode);
  1747. }
  1748. void render_wiki_text(char *pText, long lenText)
  1749. {
  1750. int idxChildTag;
  1751. int idxMyNode;
  1752. int idxChildNode;
  1753. int idxPreviousChild;
  1754. int lenBeforeTag;
  1755. char *pTagDesc;
  1756. int lenTagDesc;
  1757. char *pTagContent;
  1758. int lenTagContent;
  1759. char *pAfterTag;
  1760. int lenAfterTag;
  1761. idxMyNode = nArticleNodeCount++;
  1762. article_nodes[idxMyNode].idxTag = -1;
  1763. article_nodes[idxMyNode].pTagDesc = NULL;
  1764. article_nodes[idxMyNode].lenTagDesc = 0;
  1765. article_nodes[idxMyNode].pContent = pText;
  1766. article_nodes[idxMyNode].len = 0;
  1767. article_nodes[idxMyNode].idxChildNode = 0;
  1768. article_nodes[idxMyNode].idxNextNode = 0;
  1769. idxPreviousChild = 0;
  1770. idxChildTag = find_next_tag(pText, lenText, &lenBeforeTag, &pTagDesc, &lenTagDesc, &pTagContent, &lenTagContent, &pAfterTag, &lenAfterTag);
  1771. article_nodes[idxMyNode].len = lenBeforeTag;
  1772. while (idxChildTag >= 0)
  1773. {
  1774. idxChildNode = build_child_tree(idxPreviousChild, idxChildTag, pTagDesc, lenTagDesc, pTagContent, lenTagContent);
  1775. if (!article_nodes[idxMyNode].idxChildNode)
  1776. {
  1777. article_nodes[idxMyNode].idxChildNode = idxChildNode;
  1778. }
  1779. idxPreviousChild = idxChildNode;
  1780. pText = pAfterTag;
  1781. lenText = lenAfterTag;
  1782. idxChildTag = find_next_tag(pText, lenText, &lenBeforeTag, &pTagDesc, &lenTagDesc, &pTagContent, &lenTagContent, &pAfterTag, &lenAfterTag);
  1783. if (lenBeforeTag > 0)
  1784. {
  1785. article_nodes[nArticleNodeCount].idxTag = -1;
  1786. article_nodes[nArticleNodeCount].pTagDesc = NULL;
  1787. article_nodes[nArticleNodeCount].lenTagDesc = 0;
  1788. article_nodes[nArticleNodeCount].pContent = pText;
  1789. article_nodes[nArticleNodeCount].len = lenBeforeTag;
  1790. article_nodes[nArticleNodeCount].idxChildNode = 0;
  1791. article_nodes[nArticleNodeCount].idxNextNode = 0;
  1792. if (idxPreviousChild > 0)
  1793. {
  1794. article_nodes[idxPreviousChild].idxNextNode = nArticleNodeCount;
  1795. }
  1796. else
  1797. article_nodes[idxMyNode].idxNextNode = nArticleNodeCount;
  1798. idxPreviousChild = nArticleNodeCount;
  1799. nArticleNodeCount++;
  1800. }
  1801. }
  1802. if (msgLevel() >= 4)
  1803. {
  1804. // int i;
  1805. // for (i=0; i< nArticleNodeCount; i++)
  1806. // dump_article_node(i, 0);
  1807. dump_article_node_and_children(0, 0);
  1808. }
  1809. render_article_node(0);
  1810. }
  1811. void url_decode(char *src)
  1812. {
  1813. char dst[MAX_TITLE_LEN];
  1814. int dst_len = 0;
  1815. if (!*src)
  1816. return;
  1817. int i = 0;
  1818. int length = strlen(src);
  1819. while ( i < length && dst_len < MAX_TITLE_LEN - 1)
  1820. {
  1821. unsigned int c = src[i++];
  1822. if ( c=='%' && (i+1<length) )
  1823. {
  1824. int number = 0;
  1825. unsigned char digit = (unsigned char) src[i++];
  1826. digit = toupper(digit);
  1827. if ( digit<='9' )
  1828. digit -= 48;
  1829. else
  1830. digit -= 55;
  1831. number = digit;
  1832. digit = (unsigned char) src[i++];
  1833. digit = toupper(digit);
  1834. if ( digit<='9' )
  1835. digit -= 48;
  1836. else
  1837. digit -= 55;
  1838. number = number*16 + digit;
  1839. dst[dst_len++] = (char)number;
  1840. }
  1841. else
  1842. dst[dst_len++] = c;
  1843. }
  1844. dst[dst_len] = '\0';
  1845. strcpy(src, dst);
  1846. return;
  1847. }
  /*
   * Escape every single quote and backslash in s by inserting a backslash
   * in front of it, in place.
   *
   * size is the capacity of the buffer s (room for expansion), not the
   * length of the string.  Characters pushed beyond the capacity are lost;
   * the result is always NUL-terminated at s[size - 1] at the latest.
   * Nothing is done when size is not positive.
   */
  void pad_backslash(char *s, int size)
  {
    int i = 0;

    if (size <= 0)
      return;

    while (i < size - 2 && s[i])
    {
      if (s[i] == '\'' || s[i] == '\\')
      {
        /* Shift the tail one byte to the right; the regions overlap, so
         * memmove is required. */
        memmove(&s[i + 1], &s[i], (size_t)(size - i - 2));
        s[i] = '\\';
        i++; /* step over the escaped character as well */
      }
      i++;
    }
    s[size - 1] = '\0';
  }
  1865. long GetArticleIdxByTitle(MYSQL *conn, char *sTitle)
  1866. {
  1867. char sPaddedTitle[MAX_TITLE_LEN];
  1868. char sSQL[MAX_SQL_STR];
  1869. MYSQL_RES *res;
  1870. MYSQL_ROW row;
  1871. int rc;
  1872. long idxArticle = -1;
  1873. strcpy(sPaddedTitle, sTitle);
  1874. pad_backslash(sPaddedTitle, MAX_TITLE_LEN);
  1875. sprintf(sSQL, "select idx from entries where title='%s' and entry_type=0", sPaddedTitle);
  1876. if (!(rc = mysql_query(conn, sSQL)))
  1877. {
  1878. res = mysql_use_result(conn);
  1879. if ((row = mysql_fetch_row(res)) != NULL)
  1880. {
  1881. idxArticle = atol(row[0]);
  1882. }
  1883. mysql_free_result(res);
  1884. }
  1885. return idxArticle;
  1886. }
  /*
   * Narrow *pText / *len to the article body.
   *
   * When the marker "<!-- start content -->" is present, the text starts
   * right after it and *len is reduced by the number of bytes skipped.  When
   * the printfooter <div> is present in the remaining text, *len is cut so
   * that the text ends right before it.  Both searches use strstr(), so the
   * text must be NUL-terminated.
   */
  void trim_body_text(char **pText, int *len)
  {
    static const char sStartMarker[] = "<!-- start content -->";
    static const char sEndMarker[] = "<div class=\"printfooter\">";
    char *pFound;

    pFound = strstr(*pText, sStartMarker);
    if (pFound != NULL)
    {
      pFound += sizeof(sStartMarker) - 1; /* skip the marker itself */
      *len -= pFound - *pText;
      *pText = pFound;
    }

    pFound = strstr(*pText, sEndMarker);
    if (pFound != NULL)
    {
      *len = pFound - *pText;
    }
  }
  /*
   * Remove leading and trailing blanks (' ') from the lenSrc characters at
   * pSrc, in place.
   *
   * When anything is removed the result is NUL-terminated at its new
   * length; a string without leading or trailing blanks is left untouched.
   * A string consisting only of blanks becomes the empty string.
   */
  void trim_blanks(char *pSrc, int lenSrc)
  {
    int lenLeadingBlanks = 0;
    int lenTrailingBlanks = 0;

    while (lenLeadingBlanks < lenSrc && pSrc[lenLeadingBlanks] == ' ')
      lenLeadingBlanks++;

    /* Count trailing blanks only when a non-blank exists, so blanks are not
     * counted twice for an all-blank string. */
    if (lenLeadingBlanks < lenSrc)
    {
      while (lenTrailingBlanks < lenSrc && pSrc[lenSrc - lenTrailingBlanks - 1] == ' ')
        lenTrailingBlanks++;
    }

    /* Source and destination overlap whenever the kept text is longer than
     * the leading blanks, so memmove is required here. */
    if (lenLeadingBlanks > 0)
      memmove(pSrc, &pSrc[lenLeadingBlanks], (size_t)(lenSrc - lenLeadingBlanks - lenTrailingBlanks));

    if (lenLeadingBlanks > 0 || lenTrailingBlanks > 0)
      pSrc[lenSrc - lenLeadingBlanks - lenTrailingBlanks] = '\0';
  }
  /*
   * Append lenTagContent bytes from pTagContent to sBuf at offset lenBuf.
   *
   * sBuf has a capacity of maxLenBuf bytes, of which the last one is kept
   * free.  When the data does not fit, a message is logged and only the
   * part that fits is copied.  Nothing is copied when the buffer is already
   * full or when there is nothing to copy.
   *
   * Returns the new fill length of sBuf.  No NUL terminator is written.
   */
  long cpy_wiki_buf(char *sBuf, long lenBuf, long maxLenBuf, char *pTagContent, int lenTagContent)
  {
    long copy_len = lenTagContent;

    if (lenBuf + lenTagContent > maxLenBuf - 1)
    {
      /* Both values are long, so they are printed with %ld. */
      showMsg(0, "cpy_wiki_buf %ld > %ld\n", lenBuf + lenTagContent, maxLenBuf);
      copy_len = maxLenBuf - 1 - lenBuf;
    }

    /* A non-positive length would turn into a huge size_t in memcpy. */
    if (copy_len <= 0)
      return lenBuf;

    memcpy(&sBuf[lenBuf], pTagContent, (size_t)copy_len);
    return lenBuf + copy_len;
  }
  /*
   * List nesting of the wiki list line being processed: process_wiki_list()
   * stores one type code per nesting level in sListStack ('O' for '#', 'U'
   * for '*', 'D' for anything else) and the number of levels, capped at
   * MAX_LIST_DEPTH - 1, in nListStack.
   */
  1928. static char sListStack[MAX_LIST_DEPTH];
  1929. static int nListStack;
  /*
   * Passed to process_list_stack() as the previous list nesting.
   * NOTE(review): where these are updated is not visible in this part of the
   * file.  process_list_stack() has parameters with the same four names,
   * which hide these variables inside that function.
   */
  1930. static char sListStackPrev[MAX_LIST_DEPTH];
  1931. static int nListStackPrev;
  /*
   * Emit the list tags needed to get from the previous list nesting to the
   * current one.
   *
   * Both nestings are arrays of type codes ('O' ordered, 'U' unordered, 'D'
   * definition list), outermost level first.  Levels shared by both
   * nestings are left alone.  The remaining previous levels are closed from
   * the innermost outwards (</ol>, </ul>, </dl>), then the remaining current
   * levels are opened from the outermost inwards (<ol>, <ul>, <dl>).
   * Unknown type codes produce no output.
   *
   * Output is appended to sBuf through cpy_wiki_buf(); the new fill length
   * of sBuf is returned.
   */
  long process_list_stack(char *sListStack, int nListStack, char *sListStackPrev, int nListStackPrev, char *sBuf, long lenBuf, long maxLenBuf)
  {
    int nCommon = 0;
    int level;

    /* Length of the common prefix of both nestings. */
    while (nCommon < nListStack && nCommon < nListStackPrev && sListStack[nCommon] == sListStackPrev[nCommon])
      nCommon++;

    for (level = nListStackPrev - 1; level >= nCommon; level--)
    {
      if (sListStackPrev[level] == 'O')
        lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</ol>", 5);
      else if (sListStackPrev[level] == 'U')
        lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</ul>", 5);
      else if (sListStackPrev[level] == 'D')
        lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</dl>", 5);
    }

    for (level = nCommon; level < nListStack; level++)
    {
      if (sListStack[level] == 'O')
        lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<ol>", 4);
      else if (sListStack[level] == 'U')
        lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<ul>", 4);
      else if (sListStack[level] == 'D')
        lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<dl>", 4);
    }

    return lenBuf;
  }
  /*
   * Convert the wiki table held in wiki_nodes[idxStart..idxEnd] into markup
   * text appended to sBuf through cpy_wiki_buf().
   *
   * The node at idxStart is stepped over, as is a text node that directly
   * follows it.  After that the nodes are handled by tag:
   *   - caption: the caption text followed by "<br>";
   *   - row: a "<br>" if cells were written since the last row marker;
   *   - cells: the cell texts split at "||", separated by one blank;
   *   - header: the cell texts split at "!!", each wrapped in <b>...</b>;
   *   - anything else: handed to process_wiki_tag().
   *
   * Returns the new fill length of sBuf.
   * NOTE(review): the output starts with "<br>" and ends with "</table>",
   * but no opening "<table>" is written in this function -- confirm that
   * this is intended.
   */
  1984. long process_wiki_table(int idxStart, int idxEnd, char *sText, char *sBuf, long lenBuf, long maxLenBuf)
  1985. {
  // set to 1 by a case that has already advanced idxStart itself
  1986. int bIdxIncreased = 0;
  1987. int idxEndLocal;
  1988. int idxEndRow;
  1989. char sCellText[MAX_CELL_STRING];
  1990. char *pRow;
  1991. int lenRow;
  // set once cells were written, so that the next row marker emits a "<br>"
  1992. int bBrBeforeRow = 0;
  // set after the first cell of a row, so that later cells get a leading blank
  1993. int bBlankBeforeCell = 0;
  1994. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<br>", 4);
  1995. idxStart++;
  1996. if (idxStart <= idxEnd && wiki_nodes[idxStart].idxTag == WIKI_TAG_PAIR_TEXT)
  1997. idxStart++; // skip the table attribute
  1998. while (idxStart <= idxEnd)
  1999. {
  2000. bIdxIncreased = 0;
  2001. switch(wiki_nodes[idxStart].idxTag)
  2002. {
  2003. case WIKI_TAG_PAIR_TABLE_CAPTION:
  // only a caption start tag directly followed by a text node is handled
  2004. if (wiki_nodes[idxStart].bTagStart == 1 && idxStart < idxEnd && wiki_nodes[idxStart + 1].idxTag == WIKI_TAG_PAIR_TEXT)
  2005. {
  2006. pRow = wiki_nodes[idxStart + 1].pTag;
  2007. lenRow = wiki_nodes[idxStart + 1].lenTag;
  2008. get_next_cell(&pRow, &lenRow, NULL, sCellText);
  2009. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, sCellText, strlen(sCellText));
  // step over the caption tag and its text node
  2010. idxStart += 2;
  2011. bIdxIncreased = 1;
  2012. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<br>", 4);
  2013. }
  2014. break;
  2015. case WIKI_TAG_PAIR_TABLE_ROW:
  2016. if (bBrBeforeRow) {
  2017. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<br>", 4);
  2018. bBrBeforeRow = 0;
  2019. }
  // jump to the end of the row tag; the common idxStart++ below moves past it
  2020. idxStart = find_wiki_tag_end(idxStart, idxEnd);
  2021. bBlankBeforeCell = 0;
  2022. break;
  2023. case WIKI_TAG_PAIR_TABLE_CELLS:
  2024. if (wiki_nodes[idxStart].bTagStart == 1)
  2025. {
  2026. bBrBeforeRow = 1;
  2027. bBlankBeforeCell = 0;
  2028. idxEndRow = find_wiki_tag_end(idxStart, idxEnd);
  2029. idxStart++;
  // walk the nodes between the cells start tag and its end
  2030. while (idxStart < idxEndRow)
  2031. {
  2032. if (wiki_nodes[idxStart].idxTag == WIKI_TAG_PAIR_TEXT)
  2033. {
  2034. pRow = wiki_nodes[idxStart].pTag;
  2035. lenRow = wiki_nodes[idxStart].lenTag;
  // cells on one line are separated by "||"
  2036. while (get_next_cell(&pRow, &lenRow, "||", sCellText))
  2037. {
  2038. if (bBlankBeforeCell)
  2039. {
  2040. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, " ", 1);
  2041. }
  2042. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, sCellText, strlen(sCellText));
  2043. bBlankBeforeCell = 1;
  2044. }
  2045. }
  2046. else
  2047. {
  // a nested tag inside the cells: process it as a whole and continue after its end
  2048. idxEndLocal = find_wiki_tag_end(idxStart, idxEndRow);
  2049. lenBuf = process_wiki_tag(&idxStart, idxEndLocal, sText, sBuf, lenBuf, maxLenBuf, 0);
  2050. idxStart = idxEndLocal;
  2051. }
  2052. idxStart++;
  2053. }
  2054. bIdxIncreased = 1;
  2055. }
  2056. break;
  2057. case WIKI_TAG_PAIR_TABLE_HEADER:
  2058. if (wiki_nodes[idxStart].bTagStart == 1)
  2059. {
  2060. bBlankBeforeCell = 0;
  2061. idxEndRow = find_wiki_tag_end(idxStart, idxEnd);
  2062. idxStart++;
  2063. while (idxStart < idxEndRow)
  2064. {
  2065. if (wiki_nodes[idxStart].idxTag == WIKI_TAG_PAIR_TEXT)
  2066. {
  2067. pRow = wiki_nodes[idxStart].pTag;
  2068. lenRow = wiki_nodes[idxStart].lenTag;
  // header cells on one line are separated by "!!" and rendered bold
  2069. while (get_next_cell(&pRow, &lenRow, "!!", sCellText))
  2070. {
  2071. if (bBlankBeforeCell)
  2072. {
  2073. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, " ", 1);
  2074. }
  2075. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<b>", 3);
  2076. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, sCellText, strlen(sCellText));
  2077. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</b>", 4);
  2078. bBlankBeforeCell = 1;
  2079. }
  2080. }
  2081. else
  2082. {
  2083. idxEndLocal = find_wiki_tag_end(idxStart, idxEndRow);
  2084. lenBuf = process_wiki_tag(&idxStart, idxEndLocal, sText, sBuf, lenBuf, maxLenBuf, 0);
  2085. idxStart = idxEndLocal;
  2086. }
  2087. idxStart++;
  2088. }
  2089. bIdxIncreased = 1;
  2090. }
  2091. break;
  2092. default:
  // any other tag: process it as a whole and continue right after its end
  2093. idxEndLocal = find_wiki_tag_end(idxStart, idxEnd);
  2094. lenBuf = process_wiki_tag(&idxStart, idxEndLocal, sText, sBuf, lenBuf, maxLenBuf, 0);
  2095. idxStart = idxEndLocal + 1;
  2096. bIdxIncreased = 1;
  2097. break;
  2098. }
  // cases that did not move idxStart themselves advance by one node here
  2099. if (!bIdxIncreased)
  2100. idxStart++;
  2101. }
  2102. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</table>", 8);
  2103. return lenBuf;
  2104. }
  /*
   * Tell whether a section title names a section that is not rendered.
   *
   * pTagContent / lenTagContent describe the title text, which need not be
   * NUL-terminated.  Leading and trailing blanks are ignored.  Returns 1
   * when the trimmed title is exactly one of the unsupported section names,
   * otherwise 0.
   *
   * The comparison is exact: a title that is merely a prefix of one of the
   * names (including an empty title) is no longer reported as unsupported,
   * and the literals are never read beyond their own length.
   */
  int unsupported_section(char *pTagContent, int lenTagContent)
  {
    static const char *const asUnsupported[] = {
      "External links",
      "References",
      "See also",
      "Further reading",
      "Footnotes and References",
      "Notes",
      "Gallery",
      "Notes and references"
    };
    size_t i;

    while (lenTagContent > 0 && *pTagContent == ' ') // trim leading blanks
    {
      pTagContent++;
      lenTagContent--;
    }
    while (lenTagContent > 0 && pTagContent[lenTagContent - 1] == ' ') // trim trailing blanks
    {
      lenTagContent--;
    }
    if (lenTagContent <= 0)
      return 0;

    for (i = 0; i < sizeof(asUnsupported) / sizeof(asUnsupported[0]); i++)
    {
      if (strlen(asUnsupported[i]) == (size_t)lenTagContent &&
          !memcmp(pTagContent, asUnsupported[i], (size_t)lenTagContent))
        return 1;
    }
    return 0;
  }
  2128. long process_wiki_h2(int *idxNode, int maxNode, char *sBuf, long lenBuf, long maxLenBuf)
  2129. {
  2130. int idxNodeContent = *idxNode + 1;
  2131. if (idxNodeContent < maxNode)
  2132. {
  2133. if (unsupported_section(wiki_nodes[idxNodeContent].pTag, wiki_nodes[idxNodeContent].lenTag))
  2134. {
  2135. while (idxNodeContent < maxNode && (wiki_nodes[idxNodeContent].idxTag != WIKI_TAG_PAIR_H2 ||
  2136. wiki_nodes[idxNodeContent].bTagStart== 0))
  2137. idxNodeContent++;
  2138. }
  2139. else
  2140. {
  2141. if (lenBuf >= 4 && !memcmp(sBuf - 4, "<br>", 4))
  2142. {
  2143. if (lenBuf >= 8 && !memcmp(sBuf - 8, "<br>", 4))
  2144. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<h2>", 4);
  2145. else
  2146. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<br><h2>", 8);
  2147. }
  2148. else
  2149. {
  2150. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<br><br><h2>", 12);
  2151. }
  2152. }
  2153. *idxNode = idxNodeContent;
  2154. }
  2155. else
  2156. (*idxNode)++;
  2157. return lenBuf;
  2158. }
  2159. int parse_link_string(char *pTagContent, int lenTagContent, char *sLink, char *sLinkDisplay)
  2160. {
  2161. char *p;
  2162. int len;
  2163. if (unsupported_article(pTagContent))
  2164. {
  2165. return -1;
  2166. }
  2167. else
  2168. {
  2169. p = strnchr(pTagContent, '|', lenTagContent);
  2170. if (p)
  2171. {
  2172. len = (p - pTagContent);
  2173. if (len < 0)
  2174. len = 0;
  2175. else if (len >= MAX_LINK_STRING)
  2176. len = MAX_LINK_STRING - 1;
  2177. memcpy(sLink, pTagContent, len);
  2178. sLink[len] = '\0';
  2179. len = lenTagContent - len - 1;
  2180. if (len <= 0)
  2181. {
  2182. strcpy(sLinkDisplay, sLink);
  2183. }
  2184. else
  2185. {
  2186. if (len >= MAX_LINK_STRING)
  2187. len = MAX_LINK_STRING - 1;
  2188. memcpy(sLinkDisplay, p + 1, len);
  2189. sLinkDisplay[len] = '\0';
  2190. }
  2191. }
  2192. else
  2193. {
  2194. if (lenTagContent >= MAX_LINK_STRING)
  2195. lenTagContent = MAX_LINK_STRING - 1;
  2196. strncpy(sLink, pTagContent, lenTagContent);
  2197. sLink[lenTagContent] = '\0';
  2198. strcpy(sLinkDisplay, sLink);
  2199. }
  2200. return 0;
  2201. }
  2202. }
  2203. void parse_external_link_string(char *pTagContent, int lenTagContent, char *sLinkDisplay)
  2204. {
  2205. char *p;
  2206. int len;
  2207. p = strnchr(pTagContent, ' ', lenTagContent);
  2208. if (p)
  2209. {
  2210. len = lenTagContent - (p - pTagContent + 1);
  2211. if (len < 0)
  2212. len = 0;
  2213. else if (len >= MAX_LINK_STRING)
  2214. len = MAX_LINK_STRING - 1;
  2215. memcpy(sLinkDisplay, p + 1, len);
  2216. sLinkDisplay[len] = '\0';
  2217. }
  2218. else
  2219. {
  2220. if (lenTagContent >= MAX_LINK_STRING)
  2221. lenTagContent = MAX_LINK_STRING - 1;
  2222. memcpy(sLinkDisplay, pTagContent, lenTagContent);
  2223. sLinkDisplay[lenTagContent] = '\0';
  2224. }
  2225. if (!strncmp(sLinkDisplay, "http:", 5))
  2226. sLinkDisplay[0] = '\0';
  2227. }
  2228. long process_wiki_list(int idxStart, int idxEnd, char *sText, char *sBuf, long lenBuf, long maxLenBuf)
  2229. {
  2230. int lenBufBase;
  2231. char lastListType;
  2232. int i;
  2233. nListStack = wiki_nodes[idxStart].lenTag;
  2234. if (nListStack > MAX_LIST_DEPTH - 1)
  2235. nListStack = MAX_LIST_DEPTH - 1;
  2236. for (i = 0; i < nListStack; i++)
  2237. {
  2238. if (wiki_nodes[idxStart].pTag[i] == '#')
  2239. sListStack[i] = 'O';
  2240. else if (wiki_nodes[idxStart].pTag[i] == '*')
  2241. sListStack[i] = 'U';
  2242. else
  2243. sListStack[i] = 'D';
  2244. }
  2245. lenBuf = process_list_stack(sListStack, nListStack, sListStackPrev, nListStackPrev, sBuf, lenBuf, maxLenBuf);
  2246. if (idxStart <= idxEnd - 2)
  2247. {
  2248. lastListType = wiki_nodes[idxStart].pTag[wiki_nodes[idxStart].lenTag - 1];
  2249. if (lastListType == '*' || lastListType == '#')
  2250. {
  2251. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<li>", 4);
  2252. }
  2253. else if (lastListType == ';')
  2254. {
  2255. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<dt>", 4);
  2256. }
  2257. else if (lastListType == ':')
  2258. {
  2259. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<dd>", 4);
  2260. }
  2261. lenBufBase = lenBuf;
  2262. idxStart++;
  2263. idxEnd--;
  2264. lenBuf = process_wiki_tag(&idxStart, idxEnd, sText, sBuf, lenBuf, maxLenBuf, 1);
  2265. if (lenBuf == lenBufBase) // empty list
  2266. lenBuf -= 4;
  2267. else
  2268. {
  2269. if (lastListType == '*' || lastListType == '#')
  2270. {
  2271. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</li>", 5);
  2272. }
  2273. else if (lastListType == ';')
  2274. {
  2275. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</dt>", 5);
  2276. }
  2277. else if (lastListType == ':')
  2278. {
  2279. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</dd>", 5);
  2280. }
  2281. }
  2282. }
  2283. memcpy(sListStackPrev, sListStack, nListStack);
  2284. nListStackPrev = nListStack;
  2285. return lenBuf;
  2286. }
  2287. long process_wiki_link(int idxStart, int idxEnd, char *sText, char *sBuf, long lenBuf, long maxLenBuf)
  2288. {
  2289. char sLink[MAX_LINK_STRING];
  2290. char sLinkDisplay[MAX_LINK_STRING];
  2291. idxStart++;
  2292. if (wiki_nodes[idxStart].idxTag == WIKI_TAG_PAIR_TEXT &&
  2293. !parse_link_string(wiki_nodes[idxStart].pTag, wiki_nodes[idxStart].lenTag, sLink, sLinkDisplay))
  2294. {
  2295. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<a title=\"", 10);
  2296. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, sLink, strlen(sLink));
  2297. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "\">", 2);
  2298. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, sLinkDisplay, strlen(sLinkDisplay));
  2299. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</a>", 4);
  2300. }
  2301. return lenBuf;
  2302. }
  2303. long process_wiki_external_link(int idxStart, int idxEnd, char *sText, char *sBuf, long lenBuf, long maxLenBuf)
  2304. {
  2305. char sLink[MAX_LINK_STRING];
  2306. char sLinkDisplay[MAX_LINK_STRING];
  2307. idxStart++;
  2308. if (wiki_nodes[idxStart].idxTag == WIKI_TAG_PAIR_TEXT)
  2309. {
  2310. parse_external_link_string(wiki_nodes[idxStart].pTag, wiki_nodes[idxStart].lenTag, sLinkDisplay);
  2311. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, sLinkDisplay, strlen(sLinkDisplay));
  2312. }
  2313. return lenBuf;
  2314. }
  2315. int find_wiki_tag_end(int idxStartNode, int maxIdxTag)
  2316. {
  2317. int wiki_tag_stack[MAX_WIKI_TAG_STACK];
  2318. int nWikiTagStack = 0;
  2319. int idxEndNode;
  2320. if (wiki_nodes[idxStartNode].idxTag == WIKI_TAG_PAIR_TEXT ||
  2321. wiki_tag_pairs[wiki_nodes[idxStartNode].idxTag].lenTagEnd <= 0 ||
  2322. !wiki_nodes[idxStartNode].bTagStart)
  2323. idxEndNode = idxStartNode;
  2324. else
  2325. {
  2326. idxEndNode = idxStartNode + 1;
  2327. while (idxEndNode < maxIdxTag &&
  2328. (wiki_nodes[idxEndNode].idxTag != wiki_nodes[idxStartNode].idxTag || wiki_nodes[idxEndNode].bTagStart == 1 ||
  2329. nWikiTagStack > 0))
  2330. {
  2331. if (idxEndNode >= 0 && wiki_nodes[idxEndNode].idxTag != WIKI_TAG_PAIR_TEXT &&
  2332. wiki_tag_pairs[wiki_nodes[idxEndNode].idxTag].lenTagEnd > 0)
  2333. {
  2334. if (wiki_nodes[idxEndNode].bTagStart == 1)
  2335. {
  2336. if (nWikiTagStack < MAX_WIKI_TAG_STACK)
  2337. wiki_tag_stack[nWikiTagStack++] = wiki_nodes[idxEndNode].idxTag;
  2338. }
  2339. else
  2340. {
  2341. if (wiki_nodes[idxEndNode].idxTag == wiki_tag_stack[nWikiTagStack - 1])
  2342. nWikiTagStack--;
  2343. }
  2344. }
  2345. idxEndNode++;
  2346. }
  2347. if (idxEndNode > maxIdxTag)
  2348. idxEndNode = maxIdxTag;
  2349. }
  2350. return idxEndNode;
  2351. }
  2352. long process_wiki_tag(int *idxNode, int maxNode, char *sText, char *sBuf, long lenBuf, long maxLenBuf, int bInList)
  2353. {
  2354. int lenLocalBuf = 0;
  2355. int idxStart = *idxNode;
  2356. int idxEnd;
  2357. int bIdxIncreased;
  2358. if (idxStart >=0)
  2359. idxEnd = find_wiki_tag_end(idxStart, maxNode);
  2360. else
  2361. idxEnd = idxStart;
  2362. while (idxStart <= idxEnd)
  2363. {
  2364. showMsg(5, "process_wiki_tag idxStart %d, idxEnd %d, tag %d\n", idxStart, idxEnd, wiki_nodes[idxStart].idxTag);
  2365. bIdxIncreased = 0;
  2366. if (!bInList)
  2367. switch (wiki_nodes[idxStart].idxTag)
  2368. {
  2369. case WIKI_TAG_PAIR_OL:
  2370. case WIKI_TAG_PAIR_UL:
  2371. case WIKI_TAG_PAIR_DT:
  2372. case WIKI_TAG_PAIR_DD:
  2373. break;
  2374. default:
  2375. nListStack = 0;
  2376. if (nListStackPrev > 0)
  2377. {
  2378. lenBuf = process_list_stack(sListStack, nListStack, sListStackPrev, nListStackPrev, sBuf, lenBuf, maxLenBuf);
  2379. nListStackPrev = 0;
  2380. }
  2381. break;
  2382. }
  2383. switch (wiki_nodes[idxStart].idxTag)
  2384. {
  2385. case WIKI_TAG_PAIR_SEP:
  2386. case WIKI_TAG_PAIR_COMMENT:
  2387. case WIKI_TAG_PAIR_TEMPLATE:
  2388. case WIKI_TAG_PAIR_REF:
  2389. idxStart = idxEnd + 1;
  2390. bIdxIncreased = 1;
  2391. break;
  2392. case WIKI_TAG_PAIR_TABLE:
  2393. if (wiki_nodes[idxStart].bTagStart)
  2394. {
  2395. lenBuf = process_wiki_table(idxStart, idxEnd, sText, sBuf, lenBuf, maxLenBuf);
  2396. idxStart = idxEnd + 1;
  2397. bIdxIncreased = 1;
  2398. }
  2399. break;
  2400. case WIKI_TAG_PAIR_LINK:
  2401. if (wiki_nodes[idxStart].bTagStart)
  2402. {
  2403. lenBuf = process_wiki_link(idxStart, idxEnd, sText, sBuf, lenBuf, maxLenBuf);
  2404. idxStart = idxEnd + 1;
  2405. bIdxIncreased = 1;
  2406. }
  2407. break;
  2408. case WIKI_TAG_PAIR_EXTERNAL_LINK:
  2409. if (wiki_nodes[idxStart].bTagStart)
  2410. {
  2411. lenBuf = process_wiki_external_link(idxStart, idxEnd, sText, sBuf, lenBuf, maxLenBuf);
  2412. idxStart = idxEnd + 1;
  2413. bIdxIncreased = 1;
  2414. }
  2415. break;
  2416. case WIKI_TAG_PAIR_H6:
  2417. if (wiki_nodes[idxStart].bTagStart == 1)
  2418. {
  2419. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<h6>", 4);
  2420. }
  2421. else
  2422. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</h6>", 5);
  2423. break;
  2424. case WIKI_TAG_PAIR_H5:
  2425. if (wiki_nodes[idxStart].bTagStart == 1)
  2426. {
  2427. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<h5>", 4);
  2428. }
  2429. else
  2430. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</h5>", 5);
  2431. break;
  2432. case WIKI_TAG_PAIR_H4:
  2433. if (wiki_nodes[idxStart].bTagStart == 1)
  2434. {
  2435. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<h4>", 4);
  2436. }
  2437. else
  2438. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</h4>", 5);
  2439. break;
  2440. case WIKI_TAG_PAIR_H3:
  2441. if (wiki_nodes[idxStart].bTagStart == 1)
  2442. {
  2443. if (lenBuf >= 8 && !memcmp(sBuf - 4, "<br>", 4))
  2444. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<h3>", 4);
  2445. else
  2446. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<br><h3>", 8);
  2447. }
  2448. else
  2449. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</h3><br><br>", 5);
  2450. break;
  2451. case WIKI_TAG_PAIR_H2:
  2452. if (wiki_nodes[idxStart].bTagStart == 1)
  2453. {
  2454. lenBuf = process_wiki_h2(&idxStart, maxNode, sBuf, lenBuf, MAX_LOCAL_TEXT_BUF);
  2455. bIdxIncreased = 1;
  2456. }
  2457. else
  2458. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</h2><br><br>", 5);
  2459. break;
  2460. case WIKI_TAG_PAIR_BOLD_ITALIC:
  2461. if (wiki_nodes[idxStart].bTagStart == 1)
  2462. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<b><i>", 6);
  2463. else
  2464. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</i></b>", 8);
  2465. break;
  2466. case WIKI_TAG_PAIR_BOLD:
  2467. if (wiki_nodes[idxStart].bTagStart == 1)
  2468. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<b>", 3);
  2469. else
  2470. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</b>", 4);
  2471. break;
  2472. case WIKI_TAG_PAIR_ITALIC:
  2473. if (wiki_nodes[idxStart].bTagStart == 1)
  2474. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<i>", 3);
  2475. else
  2476. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</i>", 4);
  2477. break;
  2478. case WIKI_TAG_PAIR_NOWIKI:
  2479. if (wiki_nodes[idxStart].bTagStart == 1)
  2480. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<nowiki>", 8);
  2481. else
  2482. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</nowiki>", 9);
  2483. break;
  2484. case WIKI_TAG_PAIR_PRE:
  2485. if (wiki_nodes[idxStart].bTagStart == 1)
  2486. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<pre>", 5);
  2487. else
  2488. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</pre><br>", 10);
  2489. break;
  2490. case WIKI_TAG_PAIR_PRE_LINE:
  2491. if (wiki_nodes[idxStart].bTagStart == 1)
  2492. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "<pre>", 5);
  2493. else
  2494. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, "</pre><br>", 10);
  2495. break;
  2496. case WIKI_TAG_PAIR_OL:
  2497. case WIKI_TAG_PAIR_UL:
  2498. case WIKI_TAG_PAIR_DT:
  2499. case WIKI_TAG_PAIR_DD:
  2500. if (wiki_nodes[idxStart].bTagStart)
  2501. {
  2502. lenBuf = process_wiki_list(idxStart, idxEnd, sText, sBuf, lenBuf, maxLenBuf);
  2503. idxStart = idxEnd + 1;
  2504. bIdxIncreased = 1;
  2505. }
  2506. break;
  2507. case WIKI_TAG_PAIR_TEXT:
  2508. lenBuf = cpy_wiki_buf(sBuf, lenBuf, maxLenBuf, wiki_nodes[idxStart].pTag, wiki_nodes[idxStart].lenTag);
  2509. break;
  2510. default:
  2511. break;
  2512. }
  2513. if (!bIdxIncreased)
  2514. {
  2515. idxStart++;
  2516. }
  2517. }
  2518. *idxNode = idxStart;
  2519. sBuf[lenBuf] = '\0';
  2520. return lenBuf;
  2521. }
  2522. int in_wiki_table_cells(int *wiki_tag_stack, int nWikiTagStack)
  2523. {
  2524. int i;
  2525. for (i = 0; i < nWikiTagStack; i++)
  2526. if (wiki_tag_stack[i] == WIKI_TAG_PAIR_TABLE_CELLS)
  2527. return 1;
  2528. return 0;
  2529. }
  2530. void locate_wiki_tags(char *sText, long nTextLen)
  2531. {
  2532. int wiki_tag_stack[MAX_WIKI_TAG_STACK];
  2533. int nWikiTagStack = 0;
  2534. char *pText = sText;
  2535. int idxCurrentTag;
  2536. char *pContentBeforeTag = NULL;
  2537. long lenContentBeforeTag = 0;
  2538. int bFound;
  2539. int i;
  2540. int bBeginOfLine = 1;
  2541. char *pTemplateStart;
  2542. char *pTemplateEnd;
  2543. char *pCellDelimeter;
  2544. char *pEndOfLine;
  2545. nWikiNodeCount = 0;
  2546. idxCurrentTag = -1;
  2547. while (nTextLen > 0)
  2548. {
  2549. bFound = 0;
  2550. if (idxCurrentTag == WIKI_TAG_PAIR_TEMPLATE) // special processing for filtering templates
  2551. {
  2552. while (idxCurrentTag == WIKI_TAG_PAIR_TEMPLATE && nTextLen > 0)
  2553. {
  2554. pTemplateStart = strnstr(pText, "{{", nTextLen);
  2555. pTemplateEnd = strnstr(pText, "}}", nTextLen);
  2556. if (in_wiki_table_cells(wiki_tag_stack, nWikiTagStack)) // special processing for missing }} before ||
  2557. {
  2558. pEndOfLine = strnstr(pText, "\n", nTextLen);
  2559. pCellDelimeter = strnstr(pText, "||", nTextLen);
  2560. if (pEndOfLine && pCellDelimeter && pEndOfLine < pCellDelimeter)
  2561. pCellDelimeter = NULL;
  2562. }
  2563. else
  2564. pCellDelimeter = NULL;
  2565. if (pCellDelimeter)
  2566. {
  2567. if (pTemplateStart && pTemplateStart > pCellDelimeter) // no counting {{ or }} after ||
  2568. pTemplateStart = NULL;
  2569. if (pTemplateEnd && pTemplateEnd > pCellDelimeter) // no counting {{ or }} after ||
  2570. pTemplateEnd = NULL;
  2571. if (!pTemplateStart && !pTemplateEnd) // set missing }}
  2572. {
  2573. while (nWikiTagStack > 0 && wiki_tag_stack[nWikiTagStack - 1] == WIKI_TAG_PAIR_TEMPLATE)
  2574. {
  2575. nWikiTagStack--;
  2576. wiki_nodes[nWikiNodeCount].idxTag = WIKI_TAG_PAIR_TEMPLATE;
  2577. wiki_nodes[nWikiNodeCount].bTagStart = 0;
  2578. wiki_nodes[nWikiNodeCount].pTag = pText - 2;
  2579. wiki_nodes[nWikiNodeCount++].lenTag = 2;
  2580. lenContentBeforeTag = 0;
  2581. pContentBeforeTag = NULL;
  2582. }
  2583. if (nWikiTagStack > 0)
  2584. idxCurrentTag = wiki_tag_stack[nWikiTagStack - 1];
  2585. else
  2586. idxCurrentTag = -1;
  2587. continue;
  2588. }
  2589. }
  2590. if (!pTemplateEnd) // no end of template found
  2591. {
  2592. nTextLen = 0;
  2593. continue;
  2594. }
  2595. else if (pTemplateStart && pTemplateEnd && pTemplateStart < pTemplateEnd)
  2596. {
  2597. if (nWikiTagStack < MAX_WIKI_TAG_STACK)
  2598. wiki_tag_stack[nWikiTagStack++] = WIKI_TAG_PAIR_TEMPLATE;
  2599. nTextLen -= pTemplateStart - pText + 2;
  2600. pText = pTemplateStart + 2;
  2601. }
  2602. else
  2603. {
  2604. nWikiTagStack--;
  2605. if (nWikiTagStack > 0)
  2606. idxCurrentTag = wiki_tag_stack[nWikiTagStack - 1];
  2607. else
  2608. idxCurrentTag = -1;
  2609. if (idxCurrentTag != WIKI_TAG_PAIR_TEMPLATE)
  2610. {
  2611. wiki_nodes[nWikiNodeCount].idxTag = WIKI_TAG_PAIR_TEMPLATE;
  2612. wiki_nodes[nWikiNodeCount].bTagStart = 0;
  2613. wiki_nodes[nWikiNodeCount].pTag = pTemplateEnd;
  2614. wiki_nodes[nWikiNodeCount++].lenTag = 2;
  2615. lenContentBeforeTag = 0;
  2616. pContentBeforeTag = NULL;
  2617. }
  2618. nTextLen -= pTemplateEnd - pText + 2;
  2619. pText = pTemplateEnd + 2;
  2620. }
  2621. }
  2622. continue;
  2623. }
  2624. if (0 <= idxCurrentTag && idxCurrentTag < WIKI_TAG_PAIR_TEXT && wiki_tag_pairs[idxCurrentTag].lenTagEnd > 0 &&
  2625. !memcmp(pText, wiki_tag_pairs[idxCurrentTag].sTagEnd, wiki_tag_pairs[idxCurrentTag].lenTagEnd))
  2626. {
  2627. if (lenContentBeforeTag > 0)
  2628. {
  2629. wiki_nodes[nWikiNodeCount].idxTag = WIKI_TAG_PAIR_TEXT;
  2630. wiki_nodes[nWikiNodeCount].bTagStart = 1;
  2631. wiki_nodes[nWikiNodeCount].pTag = pContentBeforeTag;
  2632. wiki_nodes[nWikiNodeCount++].lenTag = lenContentBeforeTag;
  2633. lenContentBeforeTag = 0;
  2634. pContentBeforeTag = NULL;
  2635. }
  2636. bFound = 1;
  2637. wiki_nodes[nWikiNodeCount].idxTag = idxCurrentTag;
  2638. wiki_nodes[nWikiNodeCount].bTagStart = 0;
  2639. wiki_nodes[nWikiNodeCount].pTag = pText;
  2640. wiki_nodes[nWikiNodeCount].lenTag = wiki_tag_pairs[idxCurrentTag].lenTagEnd;
  2641. pText += wiki_tag_pairs[idxCurrentTag].lenTagEnd;
  2642. nTextLen -= wiki_tag_pairs[idxCurrentTag].lenTagEnd;
  2643. switch (idxCurrentTag)
  2644. {
  2645. case WIKI_TAG_PAIR_LINK:
  2646. if (nTextLen > 0 && *pText == ']') // fix the "[[xxx]" problem
  2647. {
  2648. pText++;
  2649. nTextLen--;
  2650. wiki_nodes[nWikiNodeCount].lenTag++;
  2651. }
  2652. break;
  2653. default:
  2654. break;
  2655. }
  2656. nWikiNodeCount++;
  2657. if (nWikiTagStack > 0)
  2658. {
  2659. if (idxCurrentTag == wiki_tag_stack[nWikiTagStack - 1])
  2660. {
  2661. nWikiTagStack--;
  2662. }
  2663. }
  2664. if (nWikiTagStack > 0)
  2665. idxCurrentTag = wiki_tag_stack[nWikiTagStack - 1];
  2666. else
  2667. idxCurrentTag = -1;
  2668. }
  2669. if (!bFound && (idxCurrentTag < 0 || idxCurrentTag >= WIKI_TAG_PAIR_TEXT || wiki_tag_pairs[idxCurrentTag].bGotChild))
  2670. {
  2671. for (i=0; i < MAX_WIKI_TAG_PAIRS && !bFound; i++)
  2672. {
  2673. if (nTextLen >= wiki_tag_pairs[i].lenTagStart &&
  2674. !memcmp(pText, wiki_tag_pairs[i].sTagStart, wiki_tag_pairs[i].lenTagStart) &&
  2675. (!wiki_tag_pairs[i].bBeginOfLine || bBeginOfLine))
  2676. {
  2677. if (lenContentBeforeTag > 0)
  2678. {
  2679. wiki_nodes[nWikiNodeCount].idxTag = WIKI_TAG_PAIR_TEXT;
  2680. wiki_nodes[nWikiNodeCount].bTagStart = 1;
  2681. wiki_nodes[nWikiNodeCount].pTag = pContentBeforeTag;
  2682. wiki_nodes[nWikiNodeCount++].lenTag = lenContentBeforeTag;
  2683. lenContentBeforeTag = 0;
  2684. pContentBeforeTag = NULL;
  2685. }
  2686. bFound = 1;
  2687. if (wiki_tag_pairs[i].lenTagEnd > 0)
  2688. {
  2689. idxCurrentTag = i;
  2690. if (nWikiTagStack < MAX_WIKI_TAG_STACK)
  2691. wiki_tag_stack[nWikiTagStack++] = idxCurrentTag;
  2692. }
  2693. else
  2694. idxCurrentTag = -1;
  2695. wiki_nodes[nWikiNodeCount].idxTag = i;
  2696. wiki_nodes[nWikiNodeCount].bTagStart = 1;
  2697. wiki_nodes[nWikiNodeCount].pTag = pText;
  2698. wiki_nodes[nWikiNodeCount].lenTag = wiki_tag_pairs[idxCurrentTag].lenTagStart;
  2699. pText += wiki_tag_pairs[i].lenTagStart;
  2700. nTextLen -= wiki_tag_pairs[i].lenTagStart;
  2701. switch (idxCurrentTag)
  2702. {
  2703. case WIKI_TAG_PAIR_OL:
  2704. case WIKI_TAG_PAIR_UL:
  2705. case WIKI_TAG_PAIR_DT:
  2706. case WIKI_TAG_PAIR_DD:
  2707. while (nTextLen > 0 && (*pText == '#' || *pText == '*' || *pText == ';' || *pText == ':'))
  2708. {
  2709. pText++;
  2710. nTextLen--;
  2711. wiki_nodes[nWikiNodeCount].lenTag++;
  2712. }
  2713. if (*pText == ' ')
  2714. {
  2715. pText++;
  2716. nTextLen--;
  2717. }
  2718. break;
  2719. default:
  2720. break;
  2721. }
  2722. nWikiNodeCount++;
  2723. }
  2724. }
  2725. for (i=0; i < MAX_WIKI_TAG_PAIRS && !bFound; i++)
  2726. {
  2727. if (wiki_tag_pairs[i].lenTagEnd && nTextLen >= wiki_tag_pairs[i].lenTagEnd &&
  2728. strcmp(wiki_tag_pairs[i].sTagEnd, "\n") && // not special end tag
  2729. !memcmp(pText, wiki_tag_pairs[i].sTagEnd, wiki_tag_pairs[i].lenTagEnd))
  2730. {
  2731. if (lenContentBeforeTag > 0)
  2732. {
  2733. wiki_nodes[nWikiNodeCount].idxTag = WIKI_TAG_PAIR_TEXT;
  2734. wiki_nodes[nWikiNodeCount].bTagStart = 1;
  2735. wiki_nodes[nWikiNodeCount].pTag = pContentBeforeTag;
  2736. wiki_nodes[nWikiNodeCount++].lenTag = lenContentBeforeTag;
  2737. lenContentBeforeTag = 0;
  2738. pContentBeforeTag = NULL;
  2739. }
  2740. bFound = 1;
  2741. idxCurrentTag = i;
  2742. wiki_nodes[nWikiNodeCount].idxTag = idxCurrentTag;
  2743. wiki_nodes[nWikiNodeCount].bTagStart = 0;
  2744. wiki_nodes[nWikiNodeCount].pTag = pText;
  2745. wiki_nodes[nWikiNodeCount++].lenTag = wiki_tag_pairs[idxCurrentTag].lenTagEnd;
  2746. pText += wiki_tag_pairs[i].lenTagStart;
  2747. nTextLen -= wiki_tag_pairs[i].lenTagStart;
  2748. if (nTextLen > 0 && *pText == ']') // fix the "[[xxx]" problem
  2749. {
  2750. pText++;
  2751. nTextLen--;
  2752. }
  2753. if (nWikiTagStack > 0)
  2754. {
  2755. if (idxCurrentTag == wiki_tag_stack[nWikiTagStack - 1])
  2756. {
  2757. nWikiTagStack--;
  2758. }
  2759. }
  2760. if (nWikiTagStack > 0)
  2761. idxCurrentTag = wiki_tag_stack[nWikiTagStack - 1];
  2762. else
  2763. idxCurrentTag = -1;
  2764. }
  2765. }
  2766. }
  2767. if (!bFound)
  2768. {
  2769. if (!pContentBeforeTag)
  2770. pContentBeforeTag = pText;
  2771. if (nTextLen > 0)
  2772. lenContentBeforeTag++;
  2773. pText++;
  2774. nTextLen--;
  2775. }
  2776. if (*(pText - 1) == '\n')
  2777. bBeginOfLine = 1;
  2778. else
  2779. bBeginOfLine = 0;
  2780. }
  2781. if (lenContentBeforeTag > 0)
  2782. {
  2783. wiki_nodes[nWikiNodeCount].idxTag = WIKI_TAG_PAIR_TEXT;
  2784. wiki_nodes[nWikiNodeCount].bTagStart = 1;
  2785. wiki_nodes[nWikiNodeCount].pTag = pContentBeforeTag;
  2786. wiki_nodes[nWikiNodeCount++].lenTag = lenContentBeforeTag;
  2787. }
  2788. }
  2789. long parse_wiki_tags(char *sText, long nTextLen, char *sBuf, long maxLenBuf)
  2790. {
  2791. int lenBuf = 0;
  2792. int idxWikiNode = 0;
  2793. locate_wiki_tags(sText, nTextLen);
  2794. if (msgLevel() >= 5)
  2795. dump_wiki_nodes();
  2796. while (idxWikiNode < nWikiNodeCount)
  2797. {
  2798. if (wiki_nodes[idxWikiNode].bTagStart)
  2799. {
  2800. lenBuf = process_wiki_tag(&idxWikiNode, nWikiNodeCount - 1, sText, sBuf, lenBuf, maxLenBuf, 0);
  2801. }
  2802. else
  2803. idxWikiNode++;
  2804. }
  2805. sBuf[lenBuf] = '\0';
  2806. lenBuf = replace_ampersand_char_crlf(sText, sBuf);
  2807. return lenBuf;
  2808. }
  2809. int get_next_cell(char **pRow, int *lenRow, char *sDelimiter, char *sCellText)
  2810. {
  2811. char *p, *p2, *p3;
  2812. char *pNextCell;
  2813. int lenCell;
  2814. int rc = 0;
  2815. char sLocalBuf[MAX_CELL_STRING];
  2816. sCellText[0] = '\0';
  2817. if (*lenRow > 0)
  2818. {
  2819. if (sDelimiter && (p = strnstr(*pRow, sDelimiter, *lenRow)) != NULL)
  2820. {
  2821. lenCell = p - (*pRow);
  2822. (*lenRow) -= lenCell + strlen(sDelimiter);
  2823. pNextCell = p + strlen(sDelimiter);
  2824. }
  2825. else
  2826. {
  2827. lenCell = *lenRow;
  2828. (*lenRow) = 0;
  2829. pNextCell = (*pRow) + lenCell;
  2830. }
  2831. p = strnchr(*pRow, '|', lenCell);
  2832. p2 = strnstr(*pRow, "{{", lenCell);
  2833. p3 = strnstr(*pRow, "[[", lenCell);
  2834. if (p && ((p2 && p2 < p) || (p3 && p3 < p))) // resolve the confusion of '|' in {{ }} or in [[ ]]
  2835. p = NULL;
  2836. if (p)
  2837. {
  2838. lenCell -= p - (*pRow) + 1;
  2839. if (lenCell >= MAX_CELL_STRING)
  2840. lenCell = MAX_CELL_STRING - 1;
  2841. memcpy(sCellText, p + 1, lenCell);
  2842. }
  2843. else
  2844. {
  2845. if (lenCell >= MAX_CELL_STRING)
  2846. lenCell = MAX_CELL_STRING - 1;
  2847. memcpy(sCellText, (*pRow), lenCell);
  2848. }
  2849. sCellText[lenCell] = '\0';
  2850. // trim_blanks(sCellText, lenCell);
  2851. // parse_wiki_tags(sCellText, lenCell, sLocalBuf, MAX_CELL_STRING);
  2852. rc = 1;
  2853. *pRow = pNextCell;
  2854. }
  2855. return rc;
  2856. }
  2857. int parse_list_stack(char *sListStack, int *nListStack, char **pTagContent, int *lenTagContent)
  2858. {
  2859. int bDone = 0;
  2860. int rc = 0;
  2861. while (!bDone && *lenTagContent > 0 && *nListStack < MAX_LIST_DEPTH - 1)
  2862. {
  2863. switch (**pTagContent)
  2864. {
  2865. case '#':
  2866. sListStack[(*nListStack)++] = 'O';
  2867. rc = LIST_LI;
  2868. break;
  2869. case '*':
  2870. sListStack[(*nListStack)++] = 'U';
  2871. rc = LIST_LI;
  2872. break;
  2873. case ';':
  2874. sListStack[(*nListStack)++] = 'D';
  2875. rc = LIST_DT;
  2876. break;
  2877. case ':':
  2878. sListStack[(*nListStack)++] = 'D';
  2879. rc = LIST_DD;
  2880. break;
  2881. default:
  2882. bDone = 1;
  2883. }
  2884. if (!bDone)
  2885. {
  2886. (*pTagContent)++;
  2887. (*lenTagContent)--;
  2888. }
  2889. }
  2890. return rc;
  2891. }
  2892. long wikiRender(MYSQL *conn, char *sRendered, long *nRenderedLines, int *nArticleLinks, int *nExternalLinks,
  2893. char *sTitle, int nTitleLen, char *sText, long nTextLen)
  2894. {
  2895. int i;
  2896. ARTICLE_HEADER article_header;
  2897. ARTICLE_LINK article_links[MAX_LINKS];
  2898. EXTERNAL_LINK external_links[MAX_LINKS];
  2899. char sExternalLinkStrings[MAX_LINK_STRING * MAX_LINKS];
  2900. // int len;
  2901. long lenExternalLinkStrings;
  2902. long idxLinkedArticle;
  2903. long lenRendered = 0;
  2904. char *sBuf;
  2905. sBuf = (char *)malloc(MAX_TEXT_BUF);
  2906. if (!sBuf)
  2907. {
  2908. showMsg(0, "wikiRender malloc error\n");
  2909. exit(-1);
  2910. }
  2911. render_buf.nLines = 0;
  2912. render_buf.current_x = -1;
  2913. render_buf.current_y = 0;
  2914. render_buf.current_indent = 0;
  2915. render_buf.list_depth = 0;
  2916. render_buf.nLinks = 0;
  2917. render_buf.nCurrentFontIdx = DEFAULT_FONT_IDX;
  2918. render_buf.nCurrentRenderType = RENDER_TYPE_NORMAL;
  2919. nArticleNodeCount = 0;
  2920. nTextLen = parse_wiki_tags(sText, nTextLen, sBuf, MAX_TEXT_BUF);
  2921. free(sBuf);
  2922. showMsg(3, "after parse_wiki_tags\n==========\n[%s]\n==========\n", sText);
  2923. nArticleNodeCount = 0;
  2924. render_title(sTitle, (long)nTitleLen);
  2925. render_wiki_text(sText, nTextLen);
  2926. article_header.article_link_count = 0;
  2927. article_header.external_link_count = 0;
  2928. lenExternalLinkStrings = 0;
  2929. for (i=0; i < render_buf.nLinks; i++)
  2930. {
  2931. if ((idxLinkedArticle = GetArticleIdxByTitle(conn, links[i].sLink)) >= 0)
  2932. {
  2933. article_links[article_header.article_link_count].start_xy = (unsigned long)(links[i].start_x | (links[i].start_y << 8));
  2934. article_links[article_header.article_link_count].end_xy = (unsigned long)(links[i].end_x | (links[i].end_y << 8));
  2935. article_links[article_header.article_link_count].article_id = idxLinkedArticle;
  2936. article_header.article_link_count++;
  2937. }
  2938. // else
  2939. // {
  2940. // external_links[article_header.external_link_count].start_xy = (unsigned long)(links[i].start_x | (links[i].start_y << 8));
  2941. // external_links[article_header.external_link_count].end_xy = (unsigned long)(links[i].end_x | (links[i].end_y << 8));
  2942. // external_links[article_header.external_link_count].offset_link_str = lenExternalLinkStrings;
  2943. // len = strlen(links[i].sLink);
  2944. // external_links[article_header.external_link_count].link_str_len = len;
  2945. // strncpy(&sExternalLinkStrings[lenExternalLinkStrings], links[i].sLink, len);
  2946. // lenExternalLinkStrings += len;
  2947. // article_header.external_link_count++;
  2948. // }
  2949. }
  2950. article_header.offset_article = sizeof(article_header) +
  2951. article_header.article_link_count * sizeof(ARTICLE_LINK) +
  2952. article_header.external_link_count * sizeof(EXTERNAL_LINK) +
  2953. lenExternalLinkStrings;
  2954. memcpy(sRendered, &article_header, sizeof(article_header));
  2955. lenRendered += sizeof(article_header);
  2956. if (article_header.article_link_count)
  2957. {
  2958. memcpy(&sRendered[lenRendered], &article_links[0], sizeof(ARTICLE_LINK) * article_header.article_link_count);
  2959. lenRendered += sizeof(ARTICLE_LINK) * article_header.article_link_count;
  2960. }
  2961. if (article_header.external_link_count)
  2962. {
  2963. memcpy(&sRendered[lenRendered], &external_links[0], sizeof(EXTERNAL_LINK) * article_header.external_link_count);
  2964. lenRendered += sizeof(EXTERNAL_LINK) * article_header.external_link_count;
  2965. }
  2966. if (lenExternalLinkStrings)
  2967. {
  2968. memcpy(&sRendered[lenRendered], sExternalLinkStrings, lenExternalLinkStrings);
  2969. lenRendered += lenExternalLinkStrings;
  2970. }
  2971. for (i=0; i < render_buf.nLines && lenRendered <= MAX_RENDERED - LCD_BUF_WIDTH_BYTES; i++)
  2972. {
  2973. memcpy(&sRendered[lenRendered], render_buf_sLines[i], strlen(render_buf_sLines[i]));
  2974. lenRendered += strlen(render_buf_sLines[i]);
  2975. }
  2976. *nRenderedLines = render_buf.current_y;
  2977. *nArticleLinks = article_header.article_link_count;
  2978. *nExternalLinks = article_header.external_link_count;
  2979. return lenRendered;
  2980. }
  // Return the offset of the first occurrence of c within the first len bytes of
  // s, or -1 if it does not occur. The search also stops at a NUL byte, so a NUL
  // itself is never found.
  int strnChrOffset(char *s, char c, int len)
  {
      int nOffset = 0;

      while (nOffset < len && s[nOffset] != '\0')
      {
          if (s[nOffset] == c)
              return nOffset;
          nOffset++;
      }
      return -1;
  }
  // Build the search form of an article title.
  //
  // '_' is read as ' ' and '~' as ':'; every resulting character accepted by
  // is_supported_search_char() is appended to sTitleSearch (NUL terminated, the
  // caller provides the space). The first two accepted characters are also stored
  // in sFirstTwoChars, the third and fourth in sSecondTwoChars; both are 3-byte
  // buffers that are zeroed first and therefore stay NUL terminated.
  void build_title_search(char *sTitle, char *sTitleSearch, char *sFirstTwoChars, char *sSecondTwoChars)
  {
      int nOut = 0;
      char *pIn;
      char ch;

      memset(sFirstTwoChars, 0, 3);
      memset(sSecondTwoChars, 0, 3);

      for (pIn = sTitle; *pIn != 0; pIn++)
      {
          ch = *pIn;
          if (ch == '_')
              ch = ' ';
          else if (ch == '~')
              ch = ':';

          if (!is_supported_search_char(ch))
              continue;

          if (nOut < 2)
              sFirstTwoChars[nOut] = ch;
          else if (nOut < 4)
              sSecondTwoChars[nOut - 2] = ch;
          sTitleSearch[nOut++] = ch;
      }
      sTitleSearch[nOut] = '\0';
  }
  // Tell whether an article text is a redirect.
  //
  // Blanks and tabs are ignored (wherever they occur) and letters a-z are
  // upper-cased; the text is a redirect if the first nine remaining characters
  // read "#REDIRECT". Returns 1 for a redirect, 0 otherwise (also when the text
  // has fewer than nine such characters).
  int is_redirect(char *s)
  {
      char sPrefix[9];
      int nCollected = 0;
      char ch;

      for (; nCollected < 9 && *s != '\0'; s++)
      {
          ch = *s;
          if (ch == ' ' || ch == '\t')
              continue;
          if (ch >= 'a' && ch <= 'z')
              ch = ch - 'a' + 'A';
          sPrefix[nCollected++] = ch;
      }

      if (nCollected < 9)
          return 0;
      return strncmp("#REDIRECT", sPrefix, 9) == 0 ? 1 : 0;
  }
  3049. off64_t next_article(FILE *fd, off64_t file_offset_for_pass_1, char *sTitle, char *sRedirect,
  3050. long *nArticleId, off64_t *nArticleOffset, long *nArticleLen, int *nType)
  3051. {
  3052. char buf[4096];
  3053. char *pBufS;
  3054. char *pBufE;
  3055. int lenBuf = 0;
  3056. int posBuf = 0;
  3057. char sArticleId[16];
  3058. char sText[MAX_TITLE_LEN];
  3059. char sTag[32];
  3060. int nTagContentStartPos;
  3061. int nTagContentEndPos;
  3062. int bDone = 0;
  3063. int bEof = 0;
  3064. int nOffsetS;
  3065. int nOffsetE;
  3066. int bInTitle = 0;
  3067. int bInArticleId = 0;
  3068. int bInText = 0;
  3069. int nCopyLen;
  3070. int nTagDepth = 0;
  3071. showMsg(3, "entering next_articlex file_offset_for_pass_1 - %ld\n", file_offset_for_pass_1);
  3072. sTitle[0] = '\0';
  3073. sRedirect[0] = '\0';
  3074. sArticleId[0] = '\0';
  3075. sText[0] = '\0';
  3076. sTag[0] = '\0';
  3077. *nArticleId = 0;
  3078. *nArticleOffset = 0;
  3079. *nArticleLen = 0;
  3080. *nType = 0;
  3081. fseeko64(fd, file_offset_for_pass_1, SEEK_SET);
  3082. while (!bDone && (posBuf < lenBuf ||!bEof))
  3083. {
  3084. if (posBuf >= lenBuf)
  3085. {
  3086. lenBuf = fread(buf, 1, sizeof(buf), fd);
  3087. bEof = feof(fd);
  3088. if (lenBuf < 0)
  3089. lenBuf = 0;
  3090. posBuf = 0;
  3091. file_offset_for_pass_1 += lenBuf;
  3092. }
  3093. nOffsetS = strnChrOffset(&buf[posBuf], '<', lenBuf - posBuf);
  3094. if (nOffsetS >= 0)
  3095. nOffsetS += posBuf;
  3096. if (nOffsetS >=0)
  3097. {
  3098. nOffsetE = strnChrOffset(&buf[nOffsetS], '>', lenBuf - nOffsetS);
  3099. if (lenBuf > nOffsetS && nOffsetE >= 0)
  3100. {
  3101. nOffsetE += nOffsetS;
  3102. posBuf = nOffsetE + 1;
  3103. if (buf[nOffsetE - 1] != '/')
  3104. {
  3105. nTagContentEndPos = nOffsetS - 1;
  3106. nCopyLen = nOffsetE - nOffsetS - 1;
  3107. if (nCopyLen > sizeof(sTag) - 1)
  3108. nCopyLen = sizeof(sTag) - 1;
  3109. memcpy(sTag, &buf[nOffsetS +1], nCopyLen);
  3110. sTag[nCopyLen] = '\0';
  3111. if (!strcmp(sTag, "page"))
  3112. nTagDepth = 0;
  3113. if ((nOffsetS = strnChrOffset(sTag, ' ', strlen(sTag))) >= 0)
  3114. sTag[nOffsetS] = '\0';
  3115. if (sTag[0] == '/' && nTagDepth > 0)
  3116. {
  3117. nTagDepth--;
  3118. if (!nTagDepth)
  3119. {
  3120. bDone = 1;
  3121. file_offset_for_pass_1 = file_offset_for_pass_1 - (lenBuf - nOffsetE - 1);
  3122. fseeko64(fd, file_offset_for_pass_1, SEEK_SET);
  3123. }
  3124. else if (!strcmp(sTag, "/title") && nTagDepth == 1)
  3125. {
  3126. nCopyLen = nTagContentEndPos - nTagContentStartPos + 1;
  3127. if (nCopyLen > MAX_TITLE_LEN - strlen(sTitle) - 1)
  3128. nCopyLen = MAX_TITLE_LEN - strlen(sTitle) - 1;
  3129. strncat(sTitle, &buf[nTagContentStartPos], nCopyLen);
  3130. bInTitle = 0;
  3131. }
  3132. else if (!strcmp(sTag, "/id") && nTagDepth == 1)
  3133. {
  3134. nCopyLen = nTagContentEndPos - nTagContentStartPos + 1;
  3135. if (nCopyLen > sizeof(sArticleId) - 1)
  3136. nCopyLen = sizeof(sArticleId) - 1;
  3137. strncat(sArticleId, &buf[nTagContentStartPos], nCopyLen);
  3138. *nArticleId = atol(sArticleId);
  3139. bInArticleId = 0;
  3140. }
  3141. else if (!strcmp(sTag, "/text") && nTagDepth == 2)
  3142. {
  3143. nCopyLen = nTagContentEndPos - nTagContentStartPos + 1;
  3144. if (nCopyLen > MAX_TITLE_LEN - strlen(sText) - 1)
  3145. nCopyLen = MAX_TITLE_LEN - strlen(sText) - 1;
  3146. strncat(sText, &buf[nTagContentStartPos], nCopyLen);
  3147. *nArticleLen = file_offset_for_pass_1 - (lenBuf - nTagContentEndPos) - *nArticleOffset + 1;
  3148. if (is_redirect(sText))
  3149. {
  3150. *nType = 1;
  3151. pBufS = strstr(sText, "[[");
  3152. pBufE = strstr(sText, "]]");
  3153. if (pBufS && pBufE)
  3154. {
  3155. nOffsetS = pBufS - sText;
  3156. nOffsetE = pBufE - sText;
  3157. strncat(sRedirect, &sText[nOffsetS+2], nOffsetE - nOffsetS - 2);
  3158. }
  3159. }
  3160. bInText = 0;
  3161. }
  3162. }
  3163. else if (nTagDepth || !strcmp(sTag, "page"))
  3164. {
  3165. nTagDepth++;
  3166. if (!strcmp(sTag, "text") && nTagDepth == 3)
  3167. {
  3168. *nArticleOffset = file_offset_for_pass_1 - (lenBuf - nOffsetE - 1);
  3169. bInText = 1;
  3170. }
  3171. else if (!strcmp(sTag, "id") && nTagDepth == 2)
  3172. bInArticleId = 1;
  3173. if (!strcmp(sTag, "title") && nTagDepth == 2)
  3174. bInTitle = 1;
  3175. nTagContentStartPos = nOffsetE + 1;
  3176. }
  3177. }
  3178. }
  3179. else
  3180. {
  3181. if (nOffsetS > 0)
  3182. file_offset_for_pass_1 = file_offset_for_pass_1 - (lenBuf - nOffsetS + 1);
  3183. fseeko64(fd, file_offset_for_pass_1, SEEK_SET);
  3184. posBuf = lenBuf;
  3185. }
  3186. }
  3187. else
  3188. {
  3189. if (bInTitle)
  3190. {
  3191. nCopyLen = lenBuf - posBuf;
  3192. if (nCopyLen > MAX_TITLE_LEN - strlen(sTitle) - 1)
  3193. nCopyLen = MAX_TITLE_LEN - strlen(sTitle) - 1;
  3194. strncat(sTitle, &buf[posBuf], nCopyLen);
  3195. }
  3196. else if (bInArticleId)
  3197. {
  3198. nCopyLen = lenBuf - posBuf;
  3199. if (nCopyLen > sizeof(sArticleId) - strlen(sArticleId) - 1)
  3200. nCopyLen = sizeof(sArticleId) - strlen(sArticleId) - 1;
  3201. strncat(sArticleId, &buf[posBuf], nCopyLen);
  3202. }
  3203. else if (bInText)
  3204. {
  3205. nCopyLen = lenBuf - posBuf;
  3206. if (nCopyLen > MAX_TITLE_LEN - strlen(sText) - 1)
  3207. nCopyLen = MAX_TITLE_LEN - strlen(sText) - 1;
  3208. strncat(sText, &buf[posBuf], nCopyLen);
  3209. }
  3210. posBuf = lenBuf;
  3211. }
  3212. }
  3213. if (!strncmp(sTitle, "Template:", 9))
  3214. {
  3215. *nType = 2;
  3216. memcpy(sTitle, &sTitle[9], strlen(sTitle) - 8);
  3217. }
  3218. else if (unsupported_article(sTitle))
  3219. {
  3220. *nType = 3;
  3221. }
  3222. showMsg(3, "exiting next_article title[%s], file_offset_for_pass_1 - %Ld\n", sTitle, file_offset_for_pass_1);
  3223. return file_offset_for_pass_1;
  3224. }
  3225. void increase_bigram(MYSQL *conn, char *sBigramChars)
  3226. {
  3227. char sSQL[MAX_SQL_STR];
  3228. sprintf(sSQL, "update bigram set occurrences=occurrences+1 where bigram_chars = '%s'", sBigramChars);
  3229. mysql_query(conn, sSQL);
  3230. }
  3231. void process_pass_1(MYSQL *conn, char *sFileName, int msgLevel, long titlesToProcess,
  3232. off64_t file_offset_for_pass_1, long max_article_idx)
  3233. {
  3234. FILE *fd;
  3235. char sTitle[MAX_TITLE_LEN];
  3236. char sTitleSearch[MAX_TITLE_LEN];
  3237. char sFirstTwoChars[3];
  3238. char sSecondTwoChars[3];
  3239. char sRedirect[MAX_TITLE_LEN];
  3240. long nArticleId;
  3241. off64_t nArticleOffset;
  3242. long nArticleLen;
  3243. off64_t nFileSize;
  3244. int rc;
  3245. int nType;
  3246. char sSQL[MAX_SQL_STR];
  3247. long titlesProcessed = 0;
  3248. int bDone = 0;
  3249. fd = fopen64(sFileName, "rb");
  3250. if (!fd)
  3251. {
  3252. showMsg(0, "cannot open file %s, error: %s\n", sFileName, strerror(errno));
  3253. exit(-1);
  3254. }
  3255. fseeko64(fd, 0, SEEK_END);
  3256. nFileSize = ftello64(fd);
  3257. processing_speed(0);
  3258. while (file_offset_for_pass_1 < nFileSize && titlesProcessed < titlesToProcess)
  3259. {
  3260. file_offset_for_pass_1 = next_article(fd, file_offset_for_pass_1, sTitle, sRedirect,
  3261. &nArticleId, &nArticleOffset, &nArticleLen, &nType);
  3262. showMsg(3, "title[%s], redirect[%s], id[%ld], offset[%ld], len[%ld], template[%d]\n",
  3263. sTitle, sRedirect, nArticleId, nArticleOffset, nArticleLen, nType);
  3264. if (sTitle[0])
  3265. {
  3266. titlesProcessed++;
  3267. if (file_offset_for_pass_1 >= nFileSize)
  3268. {
  3269. showMsg(0, "Pass 1 completed\n");
  3270. }
  3271. else if (nType == 0 || nType == 1)
  3272. {
  3273. build_title_search(sTitle, sTitleSearch, sFirstTwoChars, sSecondTwoChars);
  3274. if (sTitleSearch[0] == '\0')
  3275. continue;
  3276. if (!(titlesProcessed % 100)) // sampling bigram out of every 100 entries
  3277. {
  3278. increase_bigram(conn, sFirstTwoChars);
  3279. increase_bigram(conn, sSecondTwoChars);
  3280. mysql_commit(conn); // commit periodically
  3281. }
  3282. if (!(titlesProcessed % 10000))
  3283. processing_speed(titlesProcessed);
  3284. }
  3285. pad_backslash(sTitle, sizeof(sTitle));
  3286. pad_backslash(sTitleSearch, sizeof(sTitleSearch));
  3287. if (nType == 0)
  3288. {
  3289. max_article_idx++;
  3290. sprintf(sSQL, "insert into entries (title, title_search, idx, "
  3291. "entry_type, text_start_offset, text_len) values "
  3292. "('%s', '%s', %ld, %d, %Ld, %ld)",
  3293. sTitle, sTitleSearch, max_article_idx,
  3294. nType, nArticleOffset, nArticleLen);
  3295. }
  3296. else if (nType == 1)
  3297. {
  3298. pad_backslash(sRedirect, sizeof(sRedirect));
  3299. sprintf(sSQL, "insert into entries (title, title_search, "
  3300. "entry_type, redirect_title) values"
  3301. "('%s', '%s', %d, '%s')",
  3302. sTitle, sTitleSearch,
  3303. nType, sRedirect);
  3304. }
  3305. else
  3306. sprintf(sSQL, "insert into entries (title, entry_type, text_start_offset, text_len) values"
  3307. "('%s', %d, %Ld, %ld)", sTitle, nType, nArticleOffset, nArticleLen);
  3308. rc = mysql_query(conn, sSQL);
  3309. if (rc)
  3310. showMsg(0, "rc=%d, SQL - %s\n", rc, sSQL);
  3311. }
  3312. }
  3313. processing_speed(titlesProcessed);
  3314. mysql_commit(conn);
  3315. }
  /*
   * Cut the string at its first '#', which removes the section reference
   * from a redirect target. A string without '#' is left untouched.
   */
  void drop_pound_sign(char *s)
  {
      char *pPound = strchr(s, '#');

      if (pPound)
          *pPound = '\0'; // everything from the '#' on is discarded
  }
  /*
   * Create the ./dat/<a>/<b>/<c> directory tree that holds the rendered
   * articles with idx in [idxStart, idxEnd].
   *
   * The three levels are a = idx / 1000000, b = (idx / 10000) % 100 and
   * c = (idx / 100) % 100. The top level is deliberately not reduced
   * modulo 100 at any depth, so the directories agree with the path that
   * process_pass_2() builds for idx values of 100,000,000 and above.
   *
   * mkdir() failures are ignored on purpose: directories that already
   * exist are the normal case on a re-run.
   */
  void create_folders_for_idx_range(long idxStart, long idxEnd)
  {
      long i;
      char sPath[128];

      mkdir("./dat", 0777);
      for (i = idxStart / 1000000; i <= idxEnd / 1000000; i++)
      {
          snprintf(sPath, sizeof(sPath), "./dat/%ld", i);
          mkdir(sPath, 0777);
      }
      // here i is idx / 10000
      for (i = idxStart / 10000; i <= idxEnd / 10000; i++)
      {
          snprintf(sPath, sizeof(sPath), "./dat/%ld/%ld", i / 100, i % 100);
          mkdir(sPath, 0777);
      }
      // here i is idx / 100
      for (i = idxStart / 100; i <= idxEnd / 100; i++)
      {
          snprintf(sPath, sizeof(sPath), "./dat/%ld/%ld/%ld", i / 10000, (i / 100) % 100, i % 100);
          mkdir(sPath, 0777);
      }
  }
  3348. void process_pass_2(MYSQL *conn, MYSQL *conn2, char *sFileName, int msgLevel, long titlesToProcess, int batch,
  3349. long idxStart, long idxEnd)
  3350. {
  3351. FILE *fdXml, *fdArticle, *fdIdx, *fdDat;
  3352. char sRenderFile[128]; /* The file name (including path) of the rendered file for single article */
  3353. char sTitle[MAX_TITLE_LEN];
  3354. long idxArticle;
  3355. off64_t nArticleOffset;
  3356. unsigned int nRenderedArticleLen;
  3357. unsigned int maxRenderedArticleLen = 0;
  3358. CLzmaEncProps props;
  3359. SizeT propsSize;
  3360. unsigned int nCompressedArticleLen;
  3361. unsigned int maxCompressedArticleLen = 0;
  3362. char *sRenderedCompressed;
  3363. long maxRenderedLines = 0;
  3364. int maxArticleLinks = 0;
  3365. long nRenderedLines;
  3366. int nArticleLinks;
  3367. int nExternalLinks;
  3368. int rc;
  3369. char sSQL[MAX_SQL_STR];
  3370. MYSQL_RES *res;
  3371. MYSQL_ROW row;
  3372. char *sTextBuf;
  3373. long nTextLen;
  3374. char *sRendered;
  3375. int bDone;
  3376. long idxCurEnd;
  3377. ARTICLE_PTR *articlePtrs;
  3378. long nIdxCount;
  3379. long maxIdx = 0;
  3380. long idxBase;
  3381. long titlesProcessed = 0;
  3382. processing_speed(0);
  3383. init_lcd_draw_buf();
  3384. create_folders_for_idx_range(idxStart, idxEnd);
  3385. sTextBuf = (char *)malloc(MAX_TEXT_BUF);
  3386. sRendered = (char *)malloc(MAX_RENDERED);
  3387. if (!sTextBuf || !sRendered)
  3388. {
  3389. showMsg(0, "process_pass_2 malloc error\n");
  3390. exit(-1);
  3391. }
  3392. if (!(wiki_nodes = (struct wiki_node *)malloc(sizeof(struct wiki_node) * MAX_WIKI_NODES)))
  3393. {
  3394. showMsg(0, "wiki_node allocation error\n");
  3395. exit(-1);
  3396. }
  3397. if (!(article_nodes = (struct article_node *)malloc(sizeof(struct article_node) * MAX_ARTICLE_NODES)))
  3398. {
  3399. showMsg(0, "article_nodes allocation error\n");
  3400. exit(-1);
  3401. }
  3402. fdXml = fopen64(sFileName, "rb");
  3403. if (!fdXml)
  3404. {
  3405. showMsg(0, "cannot open file %s, error: %s\n", sFileName, strerror(errno));
  3406. exit(-1);
  3407. }
  3408. if (batch >= 0)
  3409. {
  3410. char name[13];
  3411. sprintf(name, "pedia%d.idx", batch);
  3412. if (!(fdIdx = fopen(name, "wb")))
  3413. {
  3414. showMsg(0, "cannot open file %s, error: %s\n", name, strerror(errno));
  3415. exit(-1);
  3416. }
  3417. sprintf(name, "pedia%d.dat", batch);
  3418. if (!(fdDat = fopen(name, "wb")))
  3419. {
  3420. showMsg(0, "cannot open file %s, error: %s\n", name, strerror(errno));
  3421. exit(-1);
  3422. }
  3423. nIdxCount = idxEnd - idxStart + 1;
  3424. articlePtrs = (ARTICLE_PTR *)malloc(sizeof(ARTICLE_PTR) * nIdxCount);
  3425. if (!articlePtrs)
  3426. {
  3427. showMsg(0, "malloc articlePtrs error\n");
  3428. exit(1);
  3429. }
  3430. memset(articlePtrs, 0, sizeof(ARTICLE_PTR) * nIdxCount);
  3431. sRenderedCompressed = (char *)malloc(MAX_RENDERED);
  3432. if (!sRenderedCompressed)
  3433. {
  3434. showMsg(0, "malloc sRenderedCompressed error\n");
  3435. exit(1);
  3436. }
  3437. }
  3438. idxBase = idxStart;
  3439. bDone = 0;
  3440. while (!bDone)
  3441. {
  3442. idxCurEnd = idxStart + RESULT_SET_LIMIT - 1;
  3443. if (idxCurEnd >= idxEnd)
  3444. {
  3445. idxCurEnd = idxEnd;
  3446. bDone = 1;
  3447. }
  3448. sprintf(sSQL, "select title, text_start_offset, text_len, idx from entries "
  3449. "where %ld <= idx and idx <= %ld", idxStart, idxCurEnd);
  3450. rc = mysql_query(conn, sSQL);
  3451. if (rc)
  3452. {
  3453. showMsg(0, "query entries error - %d (%s)\n", rc, mysql_error(conn));
  3454. exit(1);
  3455. }
  3456. res = mysql_use_result(conn);
  3457. while ((row = mysql_fetch_row(res)) != NULL)
  3458. {
  3459. titlesProcessed++;
  3460. if (titlesProcessed % 10000 == 0)
  3461. processing_speed(titlesProcessed);
  3462. strcpy(sTitle, row[0]);
  3463. showMsg(2, "processing entry title[%s]\n", sTitle);
  3464. sscanf(row[1], "%Ld", &nArticleOffset);
  3465. nTextLen = atol(row[2]);
  3466. idxArticle = atol(row[3]);
  3467. if (idxArticle > maxIdx)
  3468. maxIdx = idxArticle;
  3469. sprintf(sRenderFile, "./dat/%ld/%ld/%ld/%s", (idxArticle / 1000000), (idxArticle / 10000) % 100, (idxArticle / 100) % 100, row[3]);
  3470. fdArticle = fopen(sRenderFile, "rb");
  3471. if (fdArticle)
  3472. {
  3473. nRenderedArticleLen = fread(sRendered, 1, MAX_RENDERED - 1, fdArticle);
  3474. sRendered[nRenderedArticleLen - 1] = '\0';
  3475. nRenderedLines = 0; // nRenderedLines is for information only. No needs to calculate it.
  3476. nArticleLinks = 0; // nArticleLinks is for information only. No needs to calculate it.
  3477. nExternalLinks = 0; // nExternalLinks is for information only. No needs to calculate it.
  3478. fclose(fdArticle);
  3479. }
  3480. else
  3481. {
  3482. fseeko64(fdXml, nArticleOffset, SEEK_SET);
  3483. nTextLen = fread(sTextBuf, 1, nTextLen, fdXml);
  3484. sTextBuf[nTextLen] = '\0';
  3485. showMsg(2, "before wikiRender\n");
  3486. nRenderedArticleLen = wikiRender(conn2, sRendered, &nRenderedLines, &nArticleLinks, &nExternalLinks,
  3487. sTitle, strlen(sTitle), sTextBuf, nTextLen);
  3488. showMsg(2, "after wikiRender\n");
  3489. if (nRenderedArticleLen > maxRenderedArticleLen)
  3490. maxRenderedArticleLen = nRenderedArticleLen;
  3491. if (nRenderedLines > maxRenderedLines)
  3492. maxRenderedLines = nRenderedLines;
  3493. if (nArticleLinks > maxArticleLinks)
  3494. maxArticleLinks = nArticleLinks;
  3495. fdArticle = fopen(sRenderFile, "wb");
  3496. if (!fdArticle)
  3497. {
  3498. showMsg(0, "cannot open file %s, error: %s\n", sRenderFile, strerror(errno));
  3499. exit(-1);
  3500. }
  3501. fwrite(sRendered, 1, nRenderedArticleLen, fdArticle);
  3502. fclose(fdArticle);
  3503. }
  3504. if (batch >= 0)
  3505. {
  3506. sRendered[nRenderedArticleLen] = '\0';
  3507. articlePtrs[idxArticle - idxBase].offset_dat = ftell(fdDat);
  3508. LzmaEncProps_Init(&props);
  3509. propsSize = LZMA_PROPS_SIZE;
  3510. nCompressedArticleLen = MAX_RENDERED;
  3511. rc = (int)LzmaEncode((Byte *)sRenderedCompressed + LZMA_PROPS_SIZE + 1, (SizeT *)&nCompressedArticleLen,
  3512. (const Byte *)sRendered, (SizeT)nRenderedArticleLen, &props, (Byte *)sRenderedCompressed + 1, &propsSize, 0, NULL,
  3513. &g_Alloc, &g_Alloc);
  3514. if (rc != SZ_OK)
  3515. {
  3516. showMsg(0, "LzmaEncode failed - %d\n", rc);
  3517. exit(-1);
  3518. }
  3519. sRenderedCompressed[0] = (char)propsSize;
  3520. nCompressedArticleLen += LZMA_PROPS_SIZE + 1;
  3521. if (nCompressedArticleLen > maxCompressedArticleLen)
  3522. maxCompressedArticleLen = nCompressedArticleLen;
  3523. articlePtrs[idxArticle - idxBase].file_id_compressed_len = batch;
  3524. articlePtrs[idxArticle - idxBase].file_id_compressed_len = (articlePtrs[idxArticle - idxBase].file_id_compressed_len << 24) |
  3525. (nCompressedArticleLen & 0x00FFFFFF);
  3526. articlePtrs[idxArticle - idxBase].file_id_compressed_len |= 0x80000000; // using lzma compression
  3527. fwrite(sRenderedCompressed, 1, nCompressedArticleLen, fdDat);
  3528. }
  3529. showMsg(2, "End processing entry title[%s]\n", sTitle);
  3530. }
  3531. mysql_free_result(res);
  3532. idxStart += RESULT_SET_LIMIT;
  3533. }
  3534. processing_speed(titlesProcessed);
  3535. mysql_commit(conn);
  3536. free(sTextBuf);
  3537. free(sRendered);
  3538. free(article_nodes);
  3539. if (batch >= 0)
  3540. {
  3541. nIdxCount = maxIdx - idxBase + 1;
  3542. fwrite((void*)&nIdxCount, 1, sizeof(nIdxCount), fdIdx);
  3543. fwrite((void*)articlePtrs, sizeof(ARTICLE_PTR), nIdxCount, fdIdx);
  3544. fclose(fdDat);
  3545. fclose(fdIdx);
  3546. free(articlePtrs);
  3547. free(sRenderedCompressed);
  3548. showMsg(0, "maxRenderedArticleLen: %d\nmaxRenderedLines: %ld\nmaxArticleLinks: %d\nmaxCompressedArticleLen: %d\n",
  3549. maxRenderedArticleLen, maxRenderedLines, maxArticleLinks, maxCompressedArticleLen);
  3550. }
  3551. else
  3552. showMsg(0, "maxRenderedArticleLen: %d\nmaxRenderedLines: %ld\nmaxArticleLinks: %d\n",
  3553. maxRenderedArticleLen, maxRenderedLines, maxArticleLinks);
  3554. }
  3555. extern char aBigram[128][2];
  3556. void get_bigram(MYSQL *conn)
  3557. {
  3558. char sSQL[MAX_SQL_STR];
  3559. MYSQL_RES *res;
  3560. MYSQL_ROW row;
  3561. int rc;
  3562. int i;
  3563. sprintf(sSQL, "select seq from bigram order by occurrences desc limit 128");
  3564. rc = mysql_query(conn, sSQL);
  3565. if (rc)
  3566. {
  3567. showMsg(0, "query entries error - %d (%s)\n", rc, mysql_error(conn));
  3568. exit(1);
  3569. }
  3570. strcpy(sSQL, "select bigram_chars from bigram where seq in (0");
  3571. res = mysql_use_result(conn);
  3572. while ((row = mysql_fetch_row(res)) != NULL)
  3573. {
  3574. strcat(sSQL, ",");
  3575. strcat(sSQL, row[0]);
  3576. }
  3577. strcat(sSQL, ") order by bigram_chars");
  3578. mysql_free_result(res);
  3579. rc = mysql_query(conn, sSQL);
  3580. if (rc)
  3581. {
  3582. showMsg(0, "query entries error - %d (%s)\n", rc, mysql_error(conn));
  3583. exit(1);
  3584. }
  3585. res = mysql_use_result(conn);
  3586. i = 0;
  3587. while (i < 128)
  3588. {
  3589. if ((row = mysql_fetch_row(res)) != NULL)
  3590. memcpy(aBigram[i++], row[0], 2);
  3591. else
  3592. memset(aBigram[i++], 0, 2);
  3593. }
  3594. mysql_free_result(res);
  3595. mysql_commit(conn);
  3596. }
  3597. void generate_pedia_idx(MYSQL *conn, long *nIdxCount, ARTICLE_PTR **articlePtrs)
  3598. {
  3599. FILE *fdIdx[MAX_DAT_FILES];
  3600. char idxFileName[13];
  3601. int i;
  3602. ARTICLE_PTR *p;
  3603. long nCount[MAX_DAT_FILES];
  3604. memset(nCount, 0, sizeof(nCount));
  3605. *nIdxCount = 0;
  3606. for (i = 0; i < MAX_DAT_FILES; i++)
  3607. {
  3608. sprintf(idxFileName, "pedia%d.idx", i);
  3609. if ((fdIdx[i] = fopen(idxFileName, "rb")))
  3610. {
  3611. fread(&nCount[i], 1, sizeof(long), fdIdx[i]);
  3612. *nIdxCount = *nIdxCount + nCount[i];
  3613. }
  3614. }
  3615. p = (ARTICLE_PTR *)malloc(sizeof(ARTICLE_PTR) * *nIdxCount);
  3616. if (!p)
  3617. {
  3618. showMsg(0, "malloc articlePtrs error\n");
  3619. exit(1);
  3620. }
  3621. *articlePtrs = p;
  3622. for (i = 0; i < MAX_DAT_FILES; i++)
  3623. {
  3624. if (fdIdx[i])
  3625. {
  3626. fread(p, sizeof(ARTICLE_PTR), nCount[i], fdIdx[i]);
  3627. p += nCount[i];
  3628. }
  3629. }
  3630. for (i = 0; i < MAX_DAT_FILES; i++)
  3631. if (fdIdx[i])
  3632. fclose(fdIdx[i]);
  3633. }
  3634. // pedia.idx format:
  3635. // The first 4 bytes contain the article count.
  3636. // Each article has an ARTICLE_PTR structure entry in pedia.idx.
  3637. // The first ARTICLE_PTR entry is for article idx 1.
  3638. //
  3639. // pedia.pfx format:
  3640. // first three character indexing table - 54 * 54 * 54 entries * 4 bytes (long int - file offset of pedia.fnd)
  3641. // 54 characters - null + 0~9 + a~z + ...
  3642. //
  3643. // pedia.fnd format:
  3644. // bigram table - 128 entries * 2 bytes
  3645. // All titles for search are sequentially concatenated into pedia.fnd in search order. Each entry consists of (see TITLE_SEARCH_REMAINDER):
  3646. // idx of article (pointing to pedia.idx)
  3647. // variable length and null terminated remainder (starting from the 3rd character)
  /* Size in bytes of the first-three-character index table: one long per
   * (c1, c2, c3) combination. Parenthesized so the macro stays one operand
   * inside larger expressions. */
  #define SIZE_FIRST_THREE_CHAR_INDEXING (SEARCH_CHR_COUNT * SEARCH_CHR_COUNT * SEARCH_CHR_COUNT * sizeof(long))
  3649. void generate_pedia_files(MYSQL *conn, int bSplitted)
  3650. {
  3651. FILE *fdPfx, *fdFnd, *fdIdx;
  3652. long offset_fnd;
  3653. int rc;
  3654. char sSQL[MAX_SQL_STR];
  3655. MYSQL_RES *res;
  3656. MYSQL_ROW row;
  3657. char sLastTitleSearch[MAX_TITLE_LEN];
  3658. long *firstThreeCharIndexing;
  3659. TITLE_SEARCH titleSearch;
  3660. int idxFirstThreeCharIndexing;
  3661. int lastIdxFirstThreeCharIndexing = -1;
  3662. int nEntryType;
  3663. char c1, c2, c3;
  3664. long lastIdxArticle;
  3665. ARTICLE_PTR *articlePtrs;
  3666. long nIdxCount;
  3667. char t1, t2, t3, sStart[4], sEnd[4];
  3668. long nTitlesProcessed = 0;
  3669. int i, j;
  3670. fdIdx = fopen("pedia.idx", "wb");
  3671. if (!fdIdx)
  3672. {
  3673. showMsg(0, "cannot open file pedia.idx, error: %s\n", strerror(errno));
  3674. exit(-1);
  3675. }
  3676. fdPfx = fopen("pedia.pfx", "wb");
  3677. if (!fdPfx)
  3678. {
  3679. showMsg(0, "cannot open file pedia.pfx, error: %s\n", strerror(errno));
  3680. exit(-1);
  3681. }
  3682. fdFnd = fopen("pedia.fnd", "wb");
  3683. if (!fdFnd)
  3684. {
  3685. showMsg(0, "cannot open file pedia.fnd, error: %s\n", strerror(errno));
  3686. exit(-1);
  3687. }
  3688. create_search_hash("pedial.hsh");
  3689. rc = mysql_query(conn, "select idx "
  3690. "from entries where idx is not null order by idx desc limit 1");
  3691. if (rc)
  3692. {
  3693. showMsg(0, "query entries idx error - %d (%s)\n", rc, mysql_error(conn));
  3694. exit(1);
  3695. }
  3696. res = mysql_use_result(conn);
  3697. if ((row = mysql_fetch_row(res)) != NULL)
  3698. nIdxCount = atol(row[0]);
  3699. else
  3700. {
  3701. showMsg(0, "no idx found\n");
  3702. exit(1);
  3703. }
  3704. mysql_free_result(res);
  3705. generate_pedia_idx(conn, &nIdxCount, &articlePtrs);
  3706. firstThreeCharIndexing = (long *)malloc(SIZE_FIRST_THREE_CHAR_INDEXING);
  3707. fwrite(&aBigram[0][0], 1, SIZE_BIGRAM_BUF, fdFnd);
  3708. sLastTitleSearch[0] = '\0';
  3709. lastIdxArticle = 0;
  3710. memset((void*)firstThreeCharIndexing, 0, SIZE_FIRST_THREE_CHAR_INDEXING);
  3711. sStart[3] = '\0';
  3712. sEnd[3] = '\0';
  3713. for (t1 = '0'; t1 <= 'Z'; t1++)
  3714. for (t2 = '0'; t2 <= 'Z'; t2++)
  3715. for (t3 = '0'; t3 <= 'Z'; t3++)
  3716. {
  3717. if (bSplitted)
  3718. {
  3719. if (t1 == '0' && t2 == '0' && t3 == '0')
  3720. {
  3721. sEnd[0] = t1;
  3722. sEnd[1] = t2;
  3723. sEnd[2] = t3;
  3724. sprintf(sSQL,
  3725. "select e1.idx, e1.entry_type, e1.title_search, e2.idx "
  3726. "from entries e1 "
  3727. "left outer join entries e2 on (e1.entry_type = 1 && e2.title = e1.redirect_title) "
  3728. "where (e1.entry_type=0 or e1.entry_type=1) and "
  3729. " e1.title_search < '000'"
  3730. "order by e1.title_search, e1.entry_type, e1.idx, e2.idx");
  3731. }
  3732. else if (t1 == 'Z' && t2 == 'Z' && t3 == 'Z')
  3733. {
  3734. strcpy(sEnd, sStart);
  3735. sprintf(sSQL,
  3736. "select e1.idx, e1.entry_type, e1.title_search, e2.idx "
  3737. "from entries e1 "
  3738. "left outer join entries e2 on (e1.entry_type = 1 && e2.title = e1.redirect_title) "
  3739. "where (e1.entry_type=0 or e1.entry_type=1) and "
  3740. " 'ZZZ' <= e1.title_search "
  3741. "order by e1.title_search, e1.entry_type, e1.idx, e2.idx");
  3742. }
  3743. else
  3744. {
  3745. strcpy(sStart, sEnd);
  3746. sEnd[0] = t1;
  3747. sEnd[1] = t2;
  3748. sEnd[2] = t3;
  3749. sprintf(sSQL,
  3750. "select e1.idx, e1.entry_type, e1.title_search, e2.idx "
  3751. "from entries e1 "
  3752. "left outer join entries e2 on (e1.entry_type = 1 && e2.title = e1.redirect_title) "
  3753. "where (e1.entry_type=0 or e1.entry_type=1) and "
  3754. " '%s' <= e1.title_search and e1.title_search < '%s'"
  3755. "order by e1.title_search, e1.entry_type, e1.idx, e2.idx", sStart, sEnd);
  3756. }
  3757. }
  3758. else
  3759. {
  3760. if (t1 == '0' && t2 == '0' && t3 == '0')
  3761. {
  3762. sprintf(sSQL,
  3763. "select e1.idx, e1.entry_type, e1.title_search, e2.idx "
  3764. "from entries e1 "
  3765. "left outer join entries e2 on (e1.entry_type = 1 && e2.title = e1.redirect_title) "
  3766. "where (e1.entry_type=0 or e1.entry_type=1)"
  3767. "order by e1.title_search, e1.entry_type, e1.idx, e2.idx");
  3768. }
  3769. else
  3770. sSQL[0] = '\0';
  3771. }
  3772. if (sSQL[0])
  3773. {
  3774. rc = mysql_query(conn, sSQL);
  3775. if (rc)
  3776. {
  3777. showMsg(0, "query entries idx error - %d (%s)\n", rc, mysql_error(conn));
  3778. exit(1);
  3779. }
  3780. res = mysql_use_result(conn);
  3781. while ((row = mysql_fetch_row(res)) != NULL)
  3782. {
  3783. nTitlesProcessed++;
  3784. showMsg(3, "%ld [%s][%s][%s][%s]\n", nTitlesProcessed, row[0], row[1], row[2], row[3]);
  3785. nEntryType = atoi(row[1]); // entry_type
  3786. if (nEntryType == 1) // Redirected article
  3787. {
  3788. if (row[3])
  3789. titleSearch.idxArticle = atol(row[3]);
  3790. else
  3791. titleSearch.idxArticle = 0;
  3792. }
  3793. else
  3794. {
  3795. if (row[0])
  3796. titleSearch.idxArticle = atol(row[0]); // idx
  3797. else
  3798. titleSearch.idxArticle = 0;
  3799. }
  3800. if (!titleSearch.idxArticle ||
  3801. (strlen(sLastTitleSearch) == strlen(row[2]) &&
  3802. !search_string_cmp(row[2], sLastTitleSearch, strlen(sLastTitleSearch)) &&
  3803. titleSearch.idxArticle == lastIdxArticle))
  3804. {
  3805. continue; // skipping redundant title for search
  3806. }
  3807. lastIdxArticle = titleSearch.idxArticle;
  3808. switch (strlen(row[2])) // title_search
  3809. {
  3810. case 0:
  3811. c1 = '\0';
  3812. c2 = '\0';
  3813. c3 = '\0';
  3814. break;
  3815. case 1:
  3816. c1 = row[2][0];
  3817. c2 = '\0';
  3818. c3 = '\0';
  3819. break;
  3820. case 2:
  3821. c1 = row[2][0];
  3822. c2 = row[2][1];
  3823. c3 = '\0';
  3824. break;
  3825. default:
  3826. c1 = row[2][0];
  3827. c2 = row[2][1];
  3828. c3 = row[2][2];
  3829. break;
  3830. }
  3831. bigram_encode(titleSearch.sTitleSearch, row[2]);
  3832. idxFirstThreeCharIndexing = bigram_char_idx(c1) * SEARCH_CHR_COUNT * SEARCH_CHR_COUNT +
  3833. bigram_char_idx(c2) * SEARCH_CHR_COUNT + bigram_char_idx(c3);
  3834. offset_fnd = ftell(fdFnd);
  3835. if (idxFirstThreeCharIndexing != lastIdxFirstThreeCharIndexing)
  3836. {
  3837. if (c2 != '\0' && firstThreeCharIndexing[bigram_char_idx(c1) * SEARCH_CHR_COUNT * SEARCH_CHR_COUNT] == 0)
  3838. {
  3839. firstThreeCharIndexing[bigram_char_idx(c1) * SEARCH_CHR_COUNT * SEARCH_CHR_COUNT] = offset_fnd;
  3840. }
  3841. if (c3 != '\0' &&
  3842. firstThreeCharIndexing[bigram_char_idx(c1) * SEARCH_CHR_COUNT * SEARCH_CHR_COUNT +
  3843. bigram_char_idx(c2) * SEARCH_CHR_COUNT] == 0)
  3844. {
  3845. firstThreeCharIndexing[bigram_char_idx(c1) * SEARCH_CHR_COUNT * SEARCH_CHR_COUNT +
  3846. bigram_char_idx(c2) * SEARCH_CHR_COUNT] = offset_fnd;
  3847. }
  3848. firstThreeCharIndexing[idxFirstThreeCharIndexing] = offset_fnd;
  3849. lastIdxFirstThreeCharIndexing = idxFirstThreeCharIndexing;
  3850. }
  3851. if (nEntryType == 0 && titleSearch.idxArticle)
  3852. articlePtrs[titleSearch.idxArticle - 1].offset_fnd = offset_fnd;
  3853. titleSearch.cZero = '\0';
  3854. fwrite(&titleSearch, 1, sizeof(titleSearch.idxArticle) + sizeof(titleSearch.cZero) +
  3855. strlen(titleSearch.sTitleSearch) + 1, fdFnd);
  3856. }
  3857. mysql_free_result(res);
  3858. }
  3859. }
  3860. showMsg(0, "Titles processed: %ld\n", nTitlesProcessed);
  3861. fseek(fdPfx, 0, SEEK_SET);
  3862. fwrite((void*)firstThreeCharIndexing, 1, SIZE_FIRST_THREE_CHAR_INDEXING, fdPfx);
  3863. fwrite((void*)&nIdxCount, 1, sizeof(nIdxCount), fdIdx);
  3864. fwrite((void*)articlePtrs, sizeof(ARTICLE_PTR), nIdxCount, fdIdx);
  3865. free(firstThreeCharIndexing);
  3866. free(articlePtrs);
  3867. fclose(fdPfx);
  3868. fclose(fdFnd);
  3869. fclose(fdIdx);
  3870. }
  3871. int process_hash_sequential_search(char *sLocalTitleSearch, long offsetBufFnd,
  3872. int lenHashSequentialSearch, char *sHashSequentialSearch, long *countHashSequentialSearchForNextChar)
  3873. {
  3874. int lenTitleSearch = strlen(sLocalTitleSearch);
  3875. int lenSame;
  3876. int i;
  3877. int bHashAdded = 0;
  3878. for (lenSame = 0; lenSame < lenTitleSearch && lenSame < lenHashSequentialSearch; lenSame++)
  3879. {
  3880. if (tolower(sLocalTitleSearch[lenSame]) != sHashSequentialSearch[lenSame])
  3881. break;
  3882. countHashSequentialSearchForNextChar[lenSame]++;
  3883. }
  3884. for (i = lenSame; i < lenHashSequentialSearch; i++)
  3885. {
  3886. if (!bHashAdded && i >= MAX_SEARCH_STRING_ALL_HASHED_LEN && i < lenTitleSearch &&
  3887. countHashSequentialSearchForNextChar[i] >= SEARCH_HASH_SEQUENTIAL_SEARCH_THRESHOLD)
  3888. {
  3889. add_search_hash(sLocalTitleSearch, i + 1, offsetBufFnd);
  3890. bHashAdded = 1;
  3891. }
  3892. if (lenSame == MAX_SEARCH_STRING_ALL_HASHED_LEN)
  3893. countHashSequentialSearchForNextChar[i]++;
  3894. else
  3895. countHashSequentialSearchForNextChar[i] = 0;
  3896. }
  3897. if (lenTitleSearch > MAX_SEARCH_STRING_HASHED_LEN)
  3898. lenHashSequentialSearch = MAX_SEARCH_STRING_HASHED_LEN;
  3899. else
  3900. lenHashSequentialSearch = lenTitleSearch;
  3901. for (i = lenSame; i < lenHashSequentialSearch; i++)
  3902. {
  3903. sHashSequentialSearch[i] = tolower(sLocalTitleSearch[i]);
  3904. }
  3905. return lenHashSequentialSearch;
  3906. }
  3907. long build_hash_tree(char *sTitleSearch, long offsetBufFnd, char *bufFnd, long lenBufFnd)
  3908. {
  3909. int i;
  3910. int lenTitleSearch;
  3911. TITLE_SEARCH *pTitleSearch = (TITLE_SEARCH *)&bufFnd[offsetBufFnd];
  3912. int rc;
  3913. char *pSupportedChars = SUPPORTED_SEARCH_CHARS;
  3914. char c;
  3915. char sLocalTitleSearch[MAX_TITLE_LEN];
  3916. int lenHashSequentialSearch = 0;
  3917. char sHashSequentialSearch[MAX_SEARCH_STRING_HASHED_LEN];
  3918. long countHashSequentialSearchForNextChar[MAX_SEARCH_STRING_HASHED_LEN];
  3919. showMsg(3, "build_hash_tree [%s] %x\n", sTitleSearch, offsetBufFnd);
  3920. memset(countHashSequentialSearchForNextChar, 0, sizeof(countHashSequentialSearchForNextChar));
  3921. lenTitleSearch = strlen(sTitleSearch);
  3922. if (lenTitleSearch < MAX_SEARCH_STRING_ALL_HASHED_LEN)
  3923. {
  3924. for (i = 0; i < strlen(pSupportedChars); i++)
  3925. {
  3926. c = pSupportedChars[i];
  3927. if (c != ' ' || sTitleSearch[lenTitleSearch -1] != ' ') // no two continuous blanks
  3928. {
  3929. sTitleSearch[lenTitleSearch] = c;
  3930. sTitleSearch[lenTitleSearch + 1] = '\0';
  3931. bigram_decode(sLocalTitleSearch, pTitleSearch->sTitleSearch, MAX_TITLE_LEN);
  3932. while (offsetBufFnd < lenBufFnd &&
  3933. (rc = search_string_cmp(sLocalTitleSearch, sTitleSearch, strlen(sTitleSearch))) < 0)
  3934. {
  3935. lenHashSequentialSearch = process_hash_sequential_search(sLocalTitleSearch, offsetBufFnd,
  3936. lenHashSequentialSearch, sHashSequentialSearch, countHashSequentialSearchForNextChar);
  3937. offsetBufFnd += sizeof(pTitleSearch->idxArticle) + strlen(pTitleSearch->sTitleSearch) + 2;
  3938. pTitleSearch = (TITLE_SEARCH *)&bufFnd[offsetBufFnd];
  3939. bigram_decode(sLocalTitleSearch, pTitleSearch->sTitleSearch, MAX_TITLE_LEN);
  3940. }
  3941. if (offsetBufFnd < lenBufFnd && !rc)
  3942. {
  3943. add_search_hash(sTitleSearch, strlen(sTitleSearch), offsetBufFnd);
  3944. lenHashSequentialSearch = 0;
  3945. memset(countHashSequentialSearchForNextChar, 0, sizeof(countHashSequentialSearchForNextChar));
  3946. //offsetBufFnd += sizeof(pTitleSearch->idxArticle) + strlen(pTitleSearch->sTitleSearch) + 2;
  3947. if (offsetBufFnd < lenBufFnd)
  3948. {
  3949. offsetBufFnd = build_hash_tree(sTitleSearch, offsetBufFnd, bufFnd, lenBufFnd);
  3950. pTitleSearch = (TITLE_SEARCH *)&bufFnd[offsetBufFnd];
  3951. }
  3952. }
  3953. }
  3954. }
  3955. }
  3956. return offsetBufFnd;
  3957. }
  3958. void generate_pedia_hsh(void)
  3959. {
  3960. FILE *fdPfx, *fdFnd;
  3961. char sTitleSearch[MAX_TITLE_LEN];
  3962. long *firstThreeCharIndexing;
  3963. int idxFirstThreeCharIndexing;
  3964. char *bufFnd;
  3965. long lenBufFnd;
  3966. char *pSupportedChars = SUPPORTED_SEARCH_CHARS;
  3967. char c1, c2, c3;
  3968. int i, j, k;
  3969. long offsetBufFnd = 0;
  3970. fdPfx = fopen("pedia.pfx", "rb");
  3971. if (!fdPfx)
  3972. {
  3973. showMsg(0, "cannot open file pedia.pfx, error: %s\n", strerror(errno));
  3974. exit(-1);
  3975. }
  3976. fdFnd = fopen("pedia.fnd", "rb");
  3977. if (!fdFnd)
  3978. {
  3979. showMsg(0, "cannot open file pedia.fnd, error: %s\n", strerror(errno));
  3980. exit(-1);
  3981. }
  3982. init_bigram(fdFnd);
  3983. create_search_hash("pedia.hsh");
  3984. firstThreeCharIndexing = (long *)malloc(SIZE_FIRST_THREE_CHAR_INDEXING);
  3985. if (!firstThreeCharIndexing)
  3986. {
  3987. showMsg(0, "malloc firstThreeCharIndexing error\n");
  3988. exit(1);
  3989. }
  3990. fread((void*)firstThreeCharIndexing, 1, SIZE_FIRST_THREE_CHAR_INDEXING, fdPfx);
  3991. fseek(fdFnd, 0, SEEK_END);
  3992. lenBufFnd = ftell(fdFnd);
  3993. fseek(fdFnd, 0, SEEK_SET);
  3994. bufFnd = malloc(lenBufFnd);
  3995. if (!bufFnd)
  3996. {
  3997. showMsg(0, "malloc bufFnd error\n");
  3998. exit(1);
  3999. }
  4000. lenBufFnd = fread(bufFnd, 1, lenBufFnd, fdFnd);
  4001. for (i = 0; i < strlen(pSupportedChars); i++)
  4002. {
  4003. c1 = pSupportedChars[i];
  4004. if (c1 != ' ') // no initial blank
  4005. {
  4006. for (j = 0; j < strlen(pSupportedChars); j++)
  4007. {
  4008. c2 = pSupportedChars[j];
  4009. for (k = 0; k < strlen(pSupportedChars); k++)
  4010. {
  4011. c3 = pSupportedChars[k];
  4012. idxFirstThreeCharIndexing = bigram_char_idx(c1) * SEARCH_CHR_COUNT * SEARCH_CHR_COUNT +
  4013. bigram_char_idx(c2) * SEARCH_CHR_COUNT + bigram_char_idx(c3);
  4014. if (firstThreeCharIndexing[idxFirstThreeCharIndexing])
  4015. {
  4016. sTitleSearch[0] = c1;
  4017. sTitleSearch[1] = c2;
  4018. sTitleSearch[2] = c3;
  4019. sTitleSearch[3] = '\0';
  4020. if (!offsetBufFnd)
  4021. offsetBufFnd = firstThreeCharIndexing[idxFirstThreeCharIndexing];
  4022. offsetBufFnd = build_hash_tree(sTitleSearch, offsetBufFnd, bufFnd, lenBufFnd);
  4023. }
  4024. }
  4025. }
  4026. }
  4027. }
  4028. save_search_hash();
  4029. free(firstThreeCharIndexing);
  4030. free(bufFnd);
  4031. fclose(fdPfx);
  4032. fclose(fdFnd);
  4033. }
  4034. unsigned char article_buffer[FILE_BUFFER_SIZE];
  4035. unsigned char compressed_buffer[FILE_BUFFER_SIZE];
  4036. long compress_article(unsigned char *sRendered, long nRenderedArticleLen)
  4037. {
  4038. CLzmaEncProps props;
  4039. SizeT propsSize;
  4040. unsigned int nCompressedArticleLen;
  4041. int rc;
  4042. sRendered[nRenderedArticleLen] = '\0';
  4043. LzmaEncProps_Init(&props);
  4044. propsSize = LZMA_PROPS_SIZE;
  4045. nCompressedArticleLen = FILE_BUFFER_SIZE;
  4046. rc = (int)LzmaEncode((Byte *)compressed_buffer + LZMA_PROPS_SIZE + 1, (SizeT *)&nCompressedArticleLen,
  4047. (const Byte *)sRendered, (SizeT)nRenderedArticleLen, &props, (Byte *)compressed_buffer + 1, &propsSize, 0, NULL,
  4048. &g_Alloc, &g_Alloc);
  4049. if (rc != SZ_OK)
  4050. {
  4051. showMsg(0, "LzmaEncode failed - %d\n", rc);
  4052. exit(-1);
  4053. }
  4054. compressed_buffer[0] = (char)propsSize;
  4055. nCompressedArticleLen += LZMA_PROPS_SIZE + 1;
  4056. return nCompressedArticleLen;
  4057. }
  4058. long decompress_article(unsigned char *in, long dat_article_len)
  4059. {
  4060. unsigned int article_buffer_len = FILE_BUFFER_SIZE;
  4061. int rc = 0;
  4062. ELzmaStatus status;
  4063. Byte propsEncoded[LZMA_PROPS_SIZE];
  4064. unsigned int propsSize;
  4065. memcpy(compressed_buffer, in, dat_article_len);
  4066. compressed_buffer[dat_article_len] = '\0';
  4067. propsSize = (unsigned int)compressed_buffer[0];
  4068. memcpy(propsEncoded, compressed_buffer + 1, LZMA_PROPS_SIZE);
  4069. dat_article_len -= LZMA_PROPS_SIZE + 1;
  4070. rc = (int)LzmaDecode(article_buffer, &article_buffer_len, compressed_buffer + LZMA_PROPS_SIZE + 1, &dat_article_len,
  4071. propsEncoded, propsSize, LZMA_FINISH_ANY, &status, &g_Alloc);
  4072. if (rc == SZ_OK || rc == SZ_ERROR_INPUT_EOF) /* not sure why it generate SZ_ERROR_INPUT_EOF yet but result ok */
  4073. {
  4074. article_buffer[article_buffer_len] = '\0';
  4075. return article_buffer_len;
  4076. }
  4077. else
  4078. {
  4079. showMsg(0, "decompress error: %d\n", rc);
  4080. exit(-1);
  4081. }
  4082. }
  4083. #define ARTICLE_SIZE_THRESHOLD_FOR_COMPRESSION 102400
  4084. #define CONCATNATED_ARTICLE_SIZE_THRESHOLD_FOR_COMPRESSION 153600
  4085. long nTitleSearches = 0;
  4086. void convert_fnd(char *bufFnd, long lenBufFnd, TITLE_SEARCH *titleSearches)
  4087. {
  4088. long offsetBufFnd = 256; // skipping the bigram table
  4089. while (offsetBufFnd < lenBufFnd)
  4090. {
  4091. memcpy((void *)&titleSearches[nTitleSearches].idxArticle, (void *)&bufFnd[offsetBufFnd], 4);
  4092. titleSearches[nTitleSearches].cZero = '\0';
  4093. // use TITLE_SEARCH.idxArticle in bufFnd to point to the corresponding titleSearches entry
  4094. memcpy((void *)&bufFnd[offsetBufFnd], (void *)&nTitleSearches, sizeof(nTitleSearches));
  4095. offsetBufFnd += 5; // position to TITLE_SEARCH.sTitleSearch
  4096. memcpy(titleSearches[nTitleSearches].sTitleSearch, &bufFnd[offsetBufFnd], MAX_TITLE_SEARCH - 1);
  4097. titleSearches[nTitleSearches].sTitleSearch[MAX_TITLE_SEARCH - 1] = '\0';
  4098. strcpy(titleSearches[nTitleSearches].sTitleActual, titleSearches[nTitleSearches].sTitleSearch);
  4099. offsetBufFnd += strlen(&bufFnd[offsetBufFnd]) + 1;
  4100. nTitleSearches++;
  4101. }
  4102. }
  4103. void reorg_dat(ARTICLE_PTR *articlePtrs, long nIdxCount, char *bufFnd, TITLE_SEARCH *titleSearches)
  4104. {
  4105. int i, j;
  4106. FILE *fdDat[MAX_DAT_FILES];
  4107. FILE *fdOutDat[MAX_DAT_FILES];
  4108. unsigned char *articleBuf;
  4109. long idxTitleSearches;
  4110. long lenBufDat;
  4111. char sFileName[20];
  4112. unsigned char outBuf[FILE_BUFFER_SIZE];
  4113. long lenOutBuf = 0;
  4114. char nArticlesConcatnated = 0;
  4115. CONCAT_ARTICLE_INFO aConcatArticleBuf[MAX_ARTICLE_PER_COMPRESSION];
  4116. int dat_file_id, nLastDatFileId = -1;
  4117. long dat_article_len;
  4118. long offsetDat;
  4119. int len;
  4120. char sLastTitle[MAX_TITLE_SEARCH];
  4121. long double nCompressedCount = 0;
  4122. long double nTotalCompressed = 0;
  4123. long double nTotalUncompressed = 0;
  4124. long double nCompressedCountdOver = 0;
  4125. long double nTotalCompressedOver = 0;
  4126. long double nTotalUncompressedOver = 0;
  4127. long double nCompressedCountUnder = 0;
  4128. long double nTotalCompressedUnder = 0;
  4129. long double nTotalUncompressedUnder = 0;
  4130. articleBuf = (unsigned char *)malloc(MAX_RENDERED);
  4131. if (!articleBuf)
  4132. {
  4133. showMsg(0, "malloc articleBuf\n");
  4134. exit(1);
  4135. }
  4136. for (i = 0; i < MAX_DAT_FILES; i++)
  4137. {
  4138. fdDat[i] = NULL;
  4139. fdOutDat[i] = NULL;
  4140. }
  4141. sLastTitle[0] = '\0';
  4142. processing_speed(0);
  4143. for (i=0; i < nIdxCount; i++)
  4144. {
  4145. if (!(articlePtrs[i].file_id_compressed_len & 0x3FFFFFFF))
  4146. {
  4147. printf("articlePtrs[%d].file_id_compressed_len %x\n", i, articlePtrs[i].file_id_compressed_len);
  4148. continue;
  4149. }
  4150. dat_file_id = ((articlePtrs[i].file_id_compressed_len & 0x3FFFFFFF)>> 24);
  4151. if (dat_file_id >= MAX_DAT_FILES)
  4152. continue;
  4153. if (!fdDat[dat_file_id])
  4154. {
  4155. sprintf(sFileName, "pedia/wiki%d.dat", dat_file_id);
  4156. fdDat[dat_file_id] = fopen(sFileName, "rb");
  4157. if (!fdDat[dat_file_id])
  4158. continue;
  4159. sprintf(sFileName, "wiki%d.dat", dat_file_id);
  4160. fdOutDat[dat_file_id] = fopen(sFileName, "wb");
  4161. fseek(fdDat[dat_file_id], 0, SEEK_END);
  4162. lenBufDat = ftell(fdDat[dat_file_id]);
  4163. fseek(fdDat[dat_file_id], 0, SEEK_SET);
  4164. }
  4165. dat_article_len = articlePtrs[i].file_id_compressed_len & 0x00FFFFFF;
  4166. fseek(fdDat[dat_file_id], articlePtrs[i].offset_dat & 0x7FFFFFFF, SEEK_SET);
  4167. fread(articleBuf, dat_article_len, 1, fdDat[dat_file_id]);
  4168. dat_article_len = decompress_article(articleBuf, dat_article_len);
  4169. memcpy((void *)&idxTitleSearches, (void *)&bufFnd[articlePtrs[i].offset_fnd], sizeof(idxTitleSearches));
  4170. extract_title_from_article(article_buffer, titleSearches[idxTitleSearches].sTitleActual);
  4171. if (dat_file_id != nLastDatFileId)
  4172. {
  4173. printf("last title of dat %d [%s]\n", nLastDatFileId, sLastTitle);
  4174. printf("first title of dat %d [%s]\n", dat_file_id, titleSearches[idxTitleSearches].sTitleActual);
  4175. }
  4176. strcpy(sLastTitle, titleSearches[idxTitleSearches].sTitleActual);
  4177. if (nArticlesConcatnated > 0 && (nArticlesConcatnated >= MAX_ARTICLE_PER_COMPRESSION ||
  4178. dat_file_id != nLastDatFileId ||
  4179. (dat_article_len < ARTICLE_SIZE_THRESHOLD_FOR_COMPRESSION &&
  4180. lenOutBuf + dat_article_len > CONCATNATED_ARTICLE_SIZE_THRESHOLD_FOR_COMPRESSION)))
  4181. {
  4182. nTotalUncompressed += lenOutBuf;
  4183. nTotalUncompressedUnder += lenOutBuf;
  4184. offsetDat = ftell(fdOutDat[nLastDatFileId]);
  4185. lenOutBuf = compress_article(outBuf, lenOutBuf);
  4186. nTotalCompressed += lenOutBuf;
  4187. nTotalCompressedUnder += lenOutBuf;
  4188. nCompressedCountUnder += nArticlesConcatnated;
  4189. nCompressedCount++;
  4190. for (j = 0; j < nArticlesConcatnated; j++)
  4191. {
  4192. articlePtrs[aConcatArticleBuf[j].article_id - 1].offset_dat = offsetDat;
  4193. articlePtrs[aConcatArticleBuf[j].article_id - 1].file_id_compressed_len &= 0xFF000000;
  4194. articlePtrs[aConcatArticleBuf[j].article_id - 1].file_id_compressed_len |= lenOutBuf;
  4195. }
  4196. fwrite(&nArticlesConcatnated, sizeof(nArticlesConcatnated), 1, fdOutDat[dat_file_id]);
  4197. if (nArticlesConcatnated > 0)
  4198. fwrite(aConcatArticleBuf, sizeof(CONCAT_ARTICLE_INFO), nArticlesConcatnated, fdOutDat[dat_file_id]);
  4199. fwrite(&compressed_buffer, lenOutBuf, 1, fdOutDat[dat_file_id]);
  4200. lenOutBuf = 0;
  4201. nArticlesConcatnated = 0;
  4202. }
  4203. if (dat_article_len >= ARTICLE_SIZE_THRESHOLD_FOR_COMPRESSION)
  4204. {
  4205. char nLocalArticlesConcatnated = 1;
  4206. CONCAT_ARTICLE_INFO localConcatArticleBuf;
  4207. nTotalUncompressed += dat_article_len;
  4208. nTotalUncompressedOver += dat_article_len;
  4209. offsetDat = ftell(fdOutDat[nLastDatFileId]);
  4210. dat_article_len = compress_article(article_buffer, dat_article_len);
  4211. nTotalCompressed += dat_article_len;
  4212. nTotalCompressedOver += dat_article_len;
  4213. nCompressedCountdOver++;
  4214. nCompressedCount++;
  4215. localConcatArticleBuf.article_id = i + 1;
  4216. localConcatArticleBuf.offset_article = articlePtrs[aConcatArticleBuf[nArticlesConcatnated].article_id].offset_dat & 0x80000000;
  4217. localConcatArticleBuf.article_len = dat_article_len;
  4218. articlePtrs[i].offset_dat = offsetDat;
  4219. articlePtrs[i].file_id_compressed_len &= 0xFF000000;
  4220. articlePtrs[i].file_id_compressed_len |= dat_article_len;
  4221. fwrite(&nLocalArticlesConcatnated, sizeof(nLocalArticlesConcatnated), 1, fdOutDat[dat_file_id]);
  4222. fwrite(&localConcatArticleBuf, sizeof(CONCAT_ARTICLE_INFO), 1, fdOutDat[dat_file_id]);
  4223. fwrite(&compressed_buffer, dat_article_len, 1, fdOutDat[dat_file_id]);
  4224. }
  4225. else
  4226. {
  4227. aConcatArticleBuf[nArticlesConcatnated].article_id = i + 1;
  4228. aConcatArticleBuf[nArticlesConcatnated].offset_article = lenOutBuf |
  4229. articlePtrs[aConcatArticleBuf[nArticlesConcatnated].article_id].offset_dat & 0x80000000;
  4230. aConcatArticleBuf[nArticlesConcatnated].article_len = dat_article_len;
  4231. memcpy(&outBuf[lenOutBuf], article_buffer, dat_article_len);
  4232. lenOutBuf += dat_article_len;
  4233. nArticlesConcatnated++;
  4234. }
  4235. nLastDatFileId = dat_file_id;
  4236. if (i && !(i % 10000))
  4237. processing_speed(i);
  4238. }
  4239. processing_speed(i);
  4240. printf("last title of dat %d [%s]\n", nLastDatFileId, sLastTitle);
  4241. if (nArticlesConcatnated > 0)
  4242. {
  4243. nTotalUncompressed += lenOutBuf;
  4244. nTotalUncompressedUnder += lenOutBuf;
  4245. offsetDat = ftell(fdOutDat[nLastDatFileId]);
  4246. lenOutBuf = compress_article(outBuf, lenOutBuf);
  4247. nTotalCompressed += lenOutBuf;
  4248. nTotalCompressedUnder += lenOutBuf;
  4249. nCompressedCountUnder += nArticlesConcatnated;
  4250. nCompressedCount++;
  4251. for (j = 0; j < nArticlesConcatnated; j++)
  4252. {
  4253. articlePtrs[aConcatArticleBuf[j].article_id - 1].offset_dat = offsetDat;
  4254. articlePtrs[aConcatArticleBuf[j].article_id - 1].file_id_compressed_len &= 0xFF000000;
  4255. articlePtrs[aConcatArticleBuf[j].article_id - 1].file_id_compressed_len |= lenOutBuf;
  4256. }
  4257. fwrite(&nArticlesConcatnated, sizeof(nArticlesConcatnated), 1, fdOutDat[dat_file_id]);
  4258. if (nArticlesConcatnated > 0)
  4259. fwrite(aConcatArticleBuf, sizeof(CONCAT_ARTICLE_INFO), nArticlesConcatnated, fdOutDat[dat_file_id]);
  4260. fwrite(&compressed_buffer, lenOutBuf, 1, fdOutDat[dat_file_id]);
  4261. }
  4262. for (i = 0; i < MAX_DAT_FILES; i++)
  4263. {
  4264. if (fdDat[i])
  4265. fclose(fdDat[i]);
  4266. if (fdOutDat[i])
  4267. fclose(fdOutDat[i]);
  4268. }
  4269. printf("nCompressedCount = %Lf\n",nCompressedCount );
  4270. printf("nTotalCompressed = %Lf\n",nTotalCompressed );
  4271. printf("nTotalUncompressed = %Lf\n",nTotalUncompressed );
  4272. printf("nCompressedCountdOver = %Lf\n",nCompressedCountdOver );
  4273. printf("nTotalCompressedOver = %Lf\n",nTotalCompressedOver );
  4274. printf("nTotalUncompressedOver = %Lf\n",nTotalUncompressedOver );
  4275. printf("nCompressedCountUnder = %Lf\n",nCompressedCountUnder );
  4276. printf("nTotalCompressedUnder = %Lf\n",nTotalCompressedUnder );
  4277. printf("nTotalUncompressedUnder = %Lf\n",nTotalUncompressedUnder);
  4278. printf("nTotalCompressed/nTotalUncompressed = %Lf\n",nTotalCompressed/nTotalUncompressed);
  4279. printf("nTotalCompressedUnder/nTotalUncompressedUnder = %Lf\n",nTotalCompressedUnder/nTotalUncompressedUnder);
  4280. printf("nTotalCompressedOver/nTotalUncompressedOver = %Lf\n",nTotalCompressedOver/nTotalUncompressedOver);
  4281. }
  4282. void compress_fnd(TITLE_SEARCH *titleSearches, long nIdxCount, unsigned char *bufFnd, SEARCH_HASH_TABLE *search_hash_table, long nHashEntries, long *firstThreeCharIndexing)
  4283. {
  4284. char *aKeepingFullTitle;
  4285. long idxTitleSearches;
  4286. long i, j;
  4287. char sLastTitleSearch[MAX_TITLE_SEARCH];
  4288. char sLastTitleActual[MAX_TITLE_SEARCH];
  4289. aKeepingFullTitle = (char *)malloc(nTitleSearches);
  4290. if (!aKeepingFullTitle)
  4291. {
  4292. showMsg(0, "malloc aKeepingFullTitle error\n");
  4293. exit(-1);
  4294. }
  4295. memset(aKeepingFullTitle, 0, nIdxCount);
  4296. for (i = 0; i < SEARCH_CHR_COUNT * SEARCH_CHR_COUNT * SEARCH_CHR_COUNT; i++)
  4297. {
  4298. if (firstThreeCharIndexing[i])
  4299. {
  4300. memcpy((void *)&idxTitleSearches, (void *)&bufFnd[firstThreeCharIndexing[i]], sizeof(idxTitleSearches));
  4301. aKeepingFullTitle[idxTitleSearches ] = 1;
  4302. }
  4303. }
  4304. for (i = 0; i < nHashEntries; i++)
  4305. {
  4306. if (search_hash_table[i].offset_fnd)
  4307. {
  4308. memcpy((void *)&idxTitleSearches, (void *)&bufFnd[search_hash_table[i].offset_fnd], sizeof(idxTitleSearches));
  4309. aKeepingFullTitle[idxTitleSearches ] = 1;
  4310. }
  4311. }
  4312. sLastTitleSearch[0] = '\0';
  4313. sLastTitleActual[0] = '\0';
  4314. for (i=0; i < nTitleSearches; i++)
  4315. {
  4316. if (!aKeepingFullTitle[i])
  4317. {
  4318. j = 0;
  4319. while (j < 31 && sLastTitleSearch[j] && sLastTitleSearch[j] == titleSearches[i].sTitleSearch[j])
  4320. j++;
  4321. if (j > 1)
  4322. {
  4323. memcpy(&titleSearches[i].sTitleSearch[1], &titleSearches[i].sTitleSearch[j], strlen(titleSearches[i].sTitleSearch) - j);
  4324. titleSearches[i].sTitleSearch[strlen(titleSearches[i].sTitleSearch) - j] = '\0';
  4325. titleSearches[i].sTitleSearch[0] = j;
  4326. }
  4327. strcpy(sLastTitleSearch, titleSearches[i].sTitleSearch);
  4328. j = 0;
  4329. while (j < 31 && sLastTitleActual[j] && sLastTitleActual[j] == titleSearches[i].sTitleActual[j])
  4330. j++;
  4331. if (j > 1)
  4332. {
  4333. memcpy(&titleSearches[i].sTitleActual[1], &titleSearches[i].sTitleActual[j], strlen(titleSearches[i].sTitleActual) - j);
  4334. titleSearches[i].sTitleActual[strlen(titleSearches[i].sTitleActual) - j] = '\0';
  4335. titleSearches[i].sTitleActual[0] = j;
  4336. }
  4337. strcpy(sLastTitleActual, titleSearches[i].sTitleActual);
  4338. }
  4339. }
  4340. }
  4341. void save_fnd(TITLE_SEARCH *titleSearches, long nIdxCount)
  4342. {
  4343. int i;
  4344. FILE *fdOutFnd;
  4345. long offset;
  4346. int len;
  4347. char sTitle[MAX_TITLE_SEARCH];
  4348. fdOutFnd = fopen("wiki.fnd", "wb");
  4349. fwrite(&aBigram[0][0], 1, SIZE_BIGRAM_BUF, fdOutFnd);
  4350. fwrite(&nTitleSearches, sizeof(nTitleSearches), 1, fdOutFnd); // just to make sure the first titleSearches entry does not start at 0
  4351. offset = sizeof(nIdxCount) + SIZE_BIGRAM_BUF;
  4352. for (i=0; i < nTitleSearches; i++)
  4353. {
  4354. fwrite(&titleSearches[i], 1, 5, fdOutFnd);
  4355. titleSearches[i].idxArticle = offset; // use it to store the offset of fnd for idx and pfx
  4356. offset += 5;
  4357. bigram_encode(sTitle, titleSearches[i].sTitleSearch);
  4358. len = strlen(sTitle) + 1;
  4359. fwrite(sTitle, 1, len, fdOutFnd);
  4360. offset += len;
  4361. // Actual title cannot be bigram encoded since it can contain any UTF8 characters
  4362. len = strlen(titleSearches[i].sTitleActual) + 1;
  4363. fwrite(titleSearches[i].sTitleActual, 1, len, fdOutFnd);
  4364. offset += len;
  4365. }
  4366. fclose(fdOutFnd);
  4367. }
  4368. void reorg_pfx(long *firstThreeCharIndexing, unsigned char *bufFnd, TITLE_SEARCH *titleSearches)
  4369. {
  4370. int i;
  4371. long idxTitleSearches;
  4372. FILE *fdOutPfx;
  4373. fdOutPfx = fopen("wiki.pfx", "wb");
  4374. for (i=1; i < SEARCH_CHR_COUNT * SEARCH_CHR_COUNT * SEARCH_CHR_COUNT; i++)
  4375. {
  4376. if (firstThreeCharIndexing[i])
  4377. {
  4378. memcpy((void *)&idxTitleSearches, (void *)&bufFnd[firstThreeCharIndexing[i]], sizeof(idxTitleSearches));
  4379. firstThreeCharIndexing[i] = titleSearches[idxTitleSearches].idxArticle;
  4380. }
  4381. }
  4382. fwrite((void*)firstThreeCharIndexing, 1, SIZE_FIRST_THREE_CHAR_INDEXING, fdOutPfx);
  4383. fclose(fdOutPfx);
  4384. }
  4385. void reorg_idx(ARTICLE_PTR *articlePtrs, long nIdxCount, unsigned char *bufFnd, TITLE_SEARCH *titleSearches)
  4386. {
  4387. int i;
  4388. long idxTitleSearches;
  4389. FILE *fdOutIdx;
  4390. fdOutIdx = fopen("wiki.idx", "wb");
  4391. for (i=0; i < nIdxCount; i++)
  4392. {
  4393. memcpy((void *)&idxTitleSearches, (void *)&bufFnd[articlePtrs[i].offset_fnd], sizeof(idxTitleSearches));
  4394. articlePtrs[i].offset_fnd = titleSearches[idxTitleSearches].idxArticle;
  4395. }
  4396. fwrite((void*)&nIdxCount, 1, sizeof(nIdxCount), fdOutIdx);
  4397. fwrite((void*)articlePtrs, sizeof(ARTICLE_PTR), nIdxCount, fdOutIdx);
  4398. fclose(fdOutIdx);
  4399. }
  4400. void reorg_hsh(SEARCH_HASH_TABLE *search_hash_table, long nHashEntries, unsigned char *bufFnd, TITLE_SEARCH *titleSearches)
  4401. {
  4402. int i;
  4403. long idxTitleSearches;
  4404. FILE *fdOutHsh;
  4405. fdOutHsh = fopen("wiki.hsh", "wb");
  4406. for (i=0; i < nHashEntries; i++)
  4407. {
  4408. if (search_hash_table[i].offset_fnd)
  4409. {
  4410. memcpy((void *)&idxTitleSearches, (void *)&bufFnd[search_hash_table[i].offset_fnd], sizeof(idxTitleSearches));
  4411. search_hash_table[i].offset_fnd = titleSearches[idxTitleSearches ].idxArticle;
  4412. }
  4413. }
  4414. fwrite((void*)&nHashEntries, 1, sizeof(nHashEntries), fdOutHsh);
  4415. fwrite((void*)search_hash_table, sizeof(SEARCH_HASH_TABLE), nHashEntries, fdOutHsh);
  4416. fclose(fdOutHsh);
  4417. }
  4418. void reorg_pedia(void)
  4419. {
  4420. FILE *fdPfx, *fdFnd, *fdIdx, *fdHsh;
  4421. long *firstThreeCharIndexing;
  4422. unsigned char *bufFnd;
  4423. long lenBufFnd;
  4424. ARTICLE_PTR *articlePtrs;
  4425. TITLE_SEARCH *titleSearches;
  4426. long nIdxCount;
  4427. SEARCH_HASH_TABLE *search_hash_table;
  4428. long nHashEntries;
  4429. long nReadCount;
  4430. fdPfx = fopen("pedia/wiki.pfx", "rb");
  4431. if (!fdPfx)
  4432. {
  4433. showMsg(0, "cannot open file pedia.pfx, error: %s\n", strerror(errno));
  4434. exit(-1);
  4435. }
  4436. fdFnd = fopen("pedia/wiki.fnd", "rb");
  4437. if (!fdFnd)
  4438. {
  4439. showMsg(0, "cannot open file pedia.fnd, error: %s\n", strerror(errno));
  4440. exit(-1);
  4441. }
  4442. fdIdx = fopen("pedia/wiki.idx", "rb");
  4443. if (!fdIdx)
  4444. {
  4445. printf("open pedia.idx error\n");
  4446. exit(-1);
  4447. }
  4448. fdHsh = fopen("pedia/wiki.hsh", "rb");
  4449. if (!fdHsh)
  4450. {
  4451. printf("open pedia/wiki.hsh error\n");
  4452. exit(-1);
  4453. }
  4454. init_bigram(fdFnd);
  4455. firstThreeCharIndexing = (long *)malloc(SIZE_FIRST_THREE_CHAR_INDEXING);
  4456. if (!firstThreeCharIndexing)
  4457. {
  4458. showMsg(0, "malloc firstThreeCharIndexing error\n");
  4459. exit(-1);
  4460. }
  4461. fread((void*)firstThreeCharIndexing, 1, SIZE_FIRST_THREE_CHAR_INDEXING, fdPfx);
  4462. fseek(fdFnd, 0, SEEK_END);
  4463. lenBufFnd = ftell(fdFnd);
  4464. fseek(fdFnd, 0, SEEK_SET);
  4465. bufFnd = malloc(lenBufFnd);
  4466. if (!bufFnd)
  4467. {
  4468. showMsg(0, "malloc bufFnd error\n");
  4469. exit(-1);
  4470. }
  4471. lenBufFnd = fread(bufFnd, 1, lenBufFnd, fdFnd);
  4472. fread((void*)&nIdxCount, 1, sizeof(nIdxCount), fdIdx);
  4473. articlePtrs = (ARTICLE_PTR *)malloc(sizeof(ARTICLE_PTR) * nIdxCount);
  4474. if (!articlePtrs)
  4475. {
  4476. showMsg(0, "malloc articlePtrs error\n");
  4477. exit(-1);
  4478. }
  4479. nReadCount = fread((void*)articlePtrs, sizeof(ARTICLE_PTR), nIdxCount, fdIdx);
  4480. printf("nIdxCount %ld, %ld, %ld\n", nIdxCount, sizeof(TITLE_SEARCH) * nIdxCount * 3, nReadCount);
  4481. titleSearches = (TITLE_SEARCH *)malloc(sizeof(TITLE_SEARCH) * nIdxCount * 3);
  4482. if (!titleSearches)
  4483. {
  4484. showMsg(0, "malloc titleSearches error\n");
  4485. exit(-1);
  4486. }
  4487. fread(&nHashEntries, sizeof(nHashEntries), 1, fdHsh);
  4488. search_hash_table = (SEARCH_HASH_TABLE *)malloc(sizeof(SEARCH_HASH_TABLE) * nHashEntries);
  4489. if (!search_hash_table)
  4490. {
  4491. showMsg(0, "malloc search_hash_table error\n");
  4492. exit(-1);
  4493. }
  4494. fread((void *)search_hash_table, sizeof(SEARCH_HASH_TABLE), nHashEntries, fdHsh);
  4495. fclose(fdPfx);
  4496. fclose(fdFnd);
  4497. fclose(fdIdx);
  4498. fclose(fdHsh);
  4499. printf("convert_fnd\n");
  4500. convert_fnd(bufFnd, lenBufFnd, titleSearches);
  4501. printf("reorg_dat\n");
  4502. reorg_dat(articlePtrs, nIdxCount, bufFnd, titleSearches);
  4503. printf("compress_fnd\n");
  4504. compress_fnd(titleSearches, nIdxCount, bufFnd, search_hash_table, nHashEntries, firstThreeCharIndexing);
  4505. printf("save_fnd\n");
  4506. save_fnd(titleSearches, nIdxCount);
  4507. printf("reorg_pfx\n");
  4508. reorg_pfx(firstThreeCharIndexing, bufFnd, titleSearches);
  4509. printf("reorg_idx\n");
  4510. reorg_idx(articlePtrs, nIdxCount, bufFnd, titleSearches);
  4511. printf("reorg_hsh\n");
  4512. reorg_hsh(search_hash_table, nHashEntries, bufFnd, titleSearches);
  4513. free(firstThreeCharIndexing);
  4514. free(articlePtrs);
  4515. free(bufFnd);
  4516. free(titleSearches);
  4517. free(search_hash_table);
  4518. }
  4519. void process_pass_3(MYSQL *conn, MYSQL *conn2, int bSplitted)
  4520. {
  4521. init_char_idx();
  4522. get_bigram(conn);
  4523. generate_pedia_files(conn, bSplitted);
  4524. }