1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649365036513652365336543655365636573658365936603661366236633664366536663667366836693670367136723673367436753676367736783679368036813682368336843685368636873688368936903691369236933694369536963697369836993700370137023703370437053706370737083709371037113712371337143715371637173718371937203721372237233724372537263727372837293730373137323733373437353736373737383739374037413742374337443745374637473748374937503751375237533754375537563757375837593760376137623763376437653766376737683769377037713772377337743775377637773778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859 |
- /*------------------------------------------------------------------------*
- * static const char CVSID[] = "$Id: regularExp.c,v 1.30 2006/08/13 21:47:45 yooden Exp $";
- * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
- *
- * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
- * `regexec' code adapted for NEdit.
- *
- * .-------------------------------------------------------------------.
- * | ORIGINAL COPYRIGHT NOTICE: |
- * | |
- * | Copyright (c) 1986 by University of Toronto. |
- * | Written by Henry Spencer. Not derived from licensed software. |
- * | |
- * | Permission is granted to anyone to use this software for any |
- * | purpose on any computer system, and to redistribute it freely, |
- * | subject to the following restrictions: |
- * | |
- * | 1. The author is not responsible for the consequences of use of |
- * | this software, no matter how awful, even if they arise |
- * | from defects in it. |
- * | |
- * | 2. The origin of this software must not be misrepresented, either |
- * | by explicit claim or by omission. |
- * | |
- * | 3. Altered versions should be plainly marked as such, and must not |
- * | be misrepresented as being the original software. |
- * `-------------------------------------------------------------------'
- *
- * This is free software; you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation; either version 2 of the License, or (at your option) any later
- * version. In addition, you may distribute version of this program linked to
- * Motif or Open Motif. See README for details.
- *
- * This software is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * software; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *
- * BEWARE that some of this code is subtly aware of the way operator
- * precedence is structured in regular expressions. Serious changes in
- * regular-expression syntax might require a total rethink.
- * -- Henry Spencer
- * (Yes, it did!) -- Christopher Conrad, Dec. 1999
- *
- * January, 1994, Mark Edel
- * Consolidated files, changed names of external functions to avoid
- * potential conflicts with native regcomp and regexec functions, changed
- * error reporting to NEdit form, added multi-line and reverse searching,
- * and added \n \t \u \U \l \L.
- *
- * June, 1996, Mark Edel
- * Bug in NEXT macro, didn't work for expressions which compiled to over
- * 256 bytes.
- *
- * December, 1999, Christopher Conrad
- * Reformatted code for readability, improved error output, added octal and
- * hexadecimal escapes, added back-references (\1-\9), added positive look
- * ahead: (?=...), added negative lookahead: (?!...), added non-capturing
- * parentheses: (?:...), added case insensitive constructs (?i...) and
- * (?I...), added newline matching constructs (?n...) and (?N...), added
- * regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
- * Added "not a word boundary" anchor \B.
- *
- * July, 2002, Eddy De Greef
- * Added look behind, both positive (?<=...) and negative (?<!...) for
- * bounded-length patterns.
- *
- * November, 2004, Eddy De Greef
- * Added constrained matching (allowing specification of the logical end
- * of the string iso. matching till \0), and fixed several (probably
- * very old) string overrun errors that could easily result in crashes,
- * especially in client code.
- *
- * June, 2008, David Weenink
- * Adaptation for use in the Praat program.
- * Changed reg_error to call Melder_error.
- * Extra parameter for SubstituteRE to allow error differentiation
- * (not enough memory can be repaired upstream).
- *
- * April, 2018, Paul Boersma
- * Added Unicode-based definitions of letters, digits, spaces and word boundaries.
- * Thereby removed shortcut escapes from classes.
- */
- #include <ctype.h>
- #include <limits.h>
- #include "melder.h"
- /* The first byte of the regexp internal `program' is a magic number to help
- guard against corrupted data; the compiled regex code really begins in the
- second byte. */
- #define MAGIC 0234
- /* The "internal use only" fields in `regexp.h' are present to pass info from
- * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
- * simple cases. They are:
- *
- * match_start Character that must begin a match; '\0' if none obvious.
- * anchor Is the match anchored (at beginning-of-line only)?
- *
- * `match_start' and `anchor' permit very fast decisions on suitable starting
- * points for a match, considerably reducing the work done by ExecRE. */
- /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
- *
- * This is essentially a linear encoding of a nondeterministic finite-state
- * machine or NFA (aka syntax charts or `railroad normal form' in parsing
- * technology). Each node is an opcode plus a NEXT pointer, possibly
- * followed by operands. NEXT pointers of all nodes except BRANCH implement
- * concatenation; a NEXT pointer with a BRANCH on both ends of it is
- * connecting two alternatives. (Here we have one of the subtle syntax
- * dependencies: an individual BRANCH (as opposed to a collection of them) is
- * never concatenated with anything because of operator precedence.) The
- * operand of some types of nodes is a literal string; for others, it is a node
- * leading into a sub-FSM. In particular, the operand of a BRANCH node is the
- * first node of the branch. (NB this is _NOT_ a tree structure: the tail of
- * the branch connects to the thing following the set of BRANCHes.)
- *
- * The opcodes are: */
- /* DEFINITION VALUE MEANING */
- #define END 1 /* End of program. */
- /* Zero width positional assertions. */
- #define BOL 2 /* Match position at beginning of line. */
- #define EOL 3 /* Match position at end of line. */
- #define BOWORD 4 /* Match "" representing word delimiter or BOL */
- #define EOWORD 5 /* Match "" representing word delimiter or EOL */
- #define NOT_BOUNDARY 6 /* Not word boundary (\B, opposite of < and >) */
- /* Op codes with null terminated string operands. */
- #define EXACTLY 7 /* Match this string. */
- #define SIMILAR 8 /* Match this case insensitive string */
- #define ANY_OF 9 /* Match any character in the set. */
- #define ANY_BUT 10 /* Match any character not in the set. */
- /* Op codes to match any character. */
- #define ANY 11 /* Match any one character (implements '.') */
- #define EVERY 12 /* Same as ANY but matches newline. */
- /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
- #define DIGIT 13 /* Match any digit, i.e. [0123456789] */
- #define NOT_DIGIT 14 /* Match any non-digit, i.e. [^0123456789] */
- #define LETTER 15 /* Match any letter character [a-zA-Z] */
- #define NOT_LETTER 16 /* Match any non-letter character [^a-zA-Z] */
- #define SPACE 17 /* Match any whitespace character EXCEPT \n */
- #define SPACE_NL 18 /* Match any whitespace character INCLUDING \n */
- #define NOT_SPACE 19 /* Match any non-whitespace character */
- #define NOT_SPACE_NL 20 /* Same as NOT_SPACE but matches newline. */
- #define WORD_CHAR 21 /* Match any word character [a-zA-Z0-9_] */
- #define NOT_WORD_CHAR 22 /* Match any non-word character [^a-zA-Z0-9_] */
- #define IS_DELIM 23 /* Match any character that's a word delimiter */
- #define NOT_DELIM 24 /* Match any character NOT a word delimiter */
- /* Quantifier nodes. (Only applied to SIMPLE nodes. Quantifiers applied to non
- SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
- #define STAR 25 /* Match this (simple) thing 0 or more times. */
- #define LAZY_STAR 26 /* Minimal matching STAR */
- #define QUESTION 27 /* Match this (simple) thing 0 or 1 times. */
- #define LAZY_QUESTION 28 /* Minimal matching QUESTION */
- #define PLUS 29 /* Match this (simple) thing 1 or more times. */
- #define LAZY_PLUS 30 /* Minimal matching PLUS */
- #define BRACE 31 /* Match this (simple) thing m to n times. */
- #define LAZY_BRACE 32 /* Minimal matching BRACE */
- /* Nodes used to build complex constructs. */
- #define NOTHING 33 /* Match empty string (always matches) */
- #define BRANCH 34 /* Match this alternative, or the next... */
- #define BACK 35 /* Always matches, NEXT ptr points backward. */
- #define INIT_COUNT 36 /* Initialize {m,n} counter to zero */
- #define INC_COUNT 37 /* Increment {m,n} counter by one */
- #define TEST_COUNT 38 /* Test {m,n} counter against operand */
- /* Back Reference nodes. */
- #define BACK_REF 39 /* Match latest matched parenthesized text */
- #define BACK_REF_CI 40 /* Case insensitive version of BACK_REF */
- #define X_REGEX_BR 41 /* Cross-Regex Back-Ref for syntax highlighting */
- #define X_REGEX_BR_CI 42 /* Case insensitive version of X_REGEX_BR_CI */
- /* Various nodes used to implement parenthetical constructs. */
- #define POS_AHEAD_OPEN 43 /* Begin positive look ahead */
- #define NEG_AHEAD_OPEN 44 /* Begin negative look ahead */
- #define LOOK_AHEAD_CLOSE 45 /* End positive or negative look ahead */
- #define POS_BEHIND_OPEN 46 /* Begin positive look behind */
- #define NEG_BEHIND_OPEN 47 /* Begin negative look behind */
- #define LOOK_BEHIND_CLOSE 48 /* Close look behind */
- #define OPEN 49 /* Open for capturing parentheses. */
- /* OPEN+1 is number 1, etc. */
- #define CLOSE (OPEN + NSUBEXP) /* Close for capturing parentheses. */
- #define LAST_PAREN (CLOSE + NSUBEXP)
- #if (LAST_PAREN > UCHAR_MAX)
- #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
- #endif
- /* The next_ptr () function can consume up to 30% of the time during matching
- because it is called an immense number of times (an average of 25
- next_ptr() calls per match() call was witnessed for Perl syntax
- highlighting). Therefore it is well worth removing some of the function
- call overhead by selectively inlining the next_ptr() calls. Moreover,
- the inlined code can be simplified for matching because one of the tests,
- only necessary during compilation, can be left out.
- The net result of using this inlined version at two critical places is
- a 25% speedup (again, witnesses on Perl syntax highlighting). */
- #define NEXT_PTR(in_ptr, out_ptr)\
- next_ptr_offset = GET_OFFSET (in_ptr);\
- if (next_ptr_offset == 0)\
- out_ptr = NULL;\
- else {\
- if (GET_OP_CODE (in_ptr) == BACK)\
- out_ptr = in_ptr - next_ptr_offset;\
- else \
- out_ptr = in_ptr + next_ptr_offset;\
- }
- /* OPCODE NOTES:
- ------------
- All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
- bit NEXT pointer. Some nodes have a null terminated character string operand
- following the NEXT pointer. Other nodes may have an 8 bit index operand.
- The TEST_COUNT node has an index operand followed by a 16 bit test value.
- The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
- index value.
- SIMILAR
- Operand(s): null terminated string
- Implements a case insensitive match of a string. Mostly intended for use
- in syntax highlighting patterns for keywords of languages like FORTRAN
- and Ada that are case insensitive. The regex text in this node is
- converted to lower case during regex compile.
- DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
- NOT_WORD_CHAR
- Operand(s): None
- Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W.
- The Unicode-aware functions Melder_isDecimalNumber(), Melder_isLetter(),
- Melder_isHorizontalSpace(), and Melder_isWordCharacter are
- used to implement these in the hopes of increasing portability.
- NOT_BOUNDARY
- Operand(s): None
- Implements \B as a zero width assertion that the current character is
- NOT on a word boundary. Word boundaries are defined to be the position
- between two characters where one of those characters is
- a Unicode word character and the other character is not.
- IS_DELIM
- Operand(s): None
- Implements \y as any character that is a Unicode word delimiter.
- NOT_DELIM
- Operand(s): None
- Implements \Y as any character that is NOT a Unicode word delimiter.
- STAR, PLUS, QUESTION, and complex '*', '+', and '?'
- Operand(s): None (Note: NEXT pointer is usually zero. The code that
- processes this node skips over it.)
- Complex (parenthesized) versions implemented as circular BRANCH
- structures using BACK. SIMPLE versions (one character per match) are
- implemented separately for speed and to minimize recursion.
- BRACE, LAZY_BRACE
- Operand(s): minimum value (2 bytes), maximum value (2 bytes)
- Implements the {m,n} construct for atoms that are SIMPLE.
- BRANCH
- Operand(s): None
- The set of branches constituting a single choice are hooked together
- with their NEXT pointers, since precedence prevents anything being
- concatenated to any individual branch. The NEXT pointer of the last
- BRANCH in a choice points to the thing following the whole choice. This
- is also where the final NEXT pointer of each individual branch points;
- each branch starts with the operand node of a BRANCH node.
- BACK
- Operand(s): None
- Normal NEXT pointers all implicitly point forward. Back implicitly
- points backward. BACK exists to make loop structures possible.
- INIT_COUNT
- Operand(s): index (1 byte)
- Initializes the count array element referenced by the index operand.
- This node is used to build general (i.e. parenthesized) {m,n} constructs.
- INC_COUNT
- Operand(s): index (1 byte)
- Increments the count array element referenced by the index operand.
- This node is used to build general (i.e. parenthesized) {m,n} constructs.
- TEST_COUNT
- Operand(s): index (1 byte), test value (2 bytes)
- Tests the current value of the count array element specified by the
- index operand against the test value. If the current value is less than
- the test value, control passes to the node after that TEST_COUNT node.
- Otherwise control passes to the node referenced by the NEXT pointer for
- the TEST_COUNT node. This node is used to build general (i.e.
- parenthesized) {m,n} constructs.
- BACK_REF, BACK_REF_CI
- Operand(s): index (1 byte, value 1-9)
- Implements back references. This node will attempt to match whatever text
- was most recently captured by the index'th set of parentheses.
- BACK_REF_CI is case insensitive version.
- X_REGEX_BR, X_REGEX_BR_CI
- (NOT IMPLEMENTED YET)
- Operand(s): index (1 byte, value 1-9)
- Implements back references into a previously matched but separate regular
- expression. This is used by syntax highlighting patterns. This node will
- attempt to match whatever text was most captured by the index'th set of
- parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
- insensitive version.
- POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
- Operand(s): None
- Implements positive and negative look ahead. Look ahead is an assertion
- that something is either there or not there. Once this is determined the
- regex engine backtracks to where it was just before the look ahead was
- encountered, i.e. look ahead is a zero width assertion.
- POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
- Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
- Implements positive and negative look behind. Look behind is an assertion
- that something is either there or not there in front of the current
- position. Look behind is a zero width assertion, with the additional
- constraint that it must have a bounded length (for complexity and
- efficiency reasons; note that most other implementation even impose
- fixed length).
- OPEN, CLOSE
- Operand(s): None
- OPEN + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
- 'n', and are numbered at compile time.
- */
- /* A node is one char of opcode followed by two chars of NEXT pointer plus
- * any operands. NEXT pointers are stored as two 8-bit pieces, high order
- * first. The value is a positive offset from the opcode of the node
- * containing it. An operand, if any, simply follows the node. (Note that
- * much of the code generation knows about this implicit relationship.)
- *
- * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
- * but allows patterns to get big without disasters. */
- #define OP_CODE_SIZE 1
- #define NEXT_PTR_SIZE 2
- #define INDEX_SIZE 1
- #define LENGTH_SIZE 4
- #define NODE_SIZE (NEXT_PTR_SIZE + OP_CODE_SIZE)
- #define GET_OP_CODE(p) (*(char32 *)(p))
- #define OPERAND(p) ((p) + NODE_SIZE)
- #define GET_OFFSET(p) ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
- #define PUT_OFFSET_L(v) (char32)(((v) >> 8) & 0377)
- #define PUT_OFFSET_R(v) (char32) ((v) & 0377)
- #define GET_LOWER(p) ((( *((p) + NODE_SIZE) & 0377) << 8) + \
- (( *((p) + NODE_SIZE+1)) & 0377))
- #define GET_UPPER(p) ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
- (( *((p) + NODE_SIZE+3)) & 0377))
- /* Utility definitions. */
- #define REG_FAIL(m) {*Error_Ptr = (m); return (NULL);}
- #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
- (c) == '?' || (c) == Brace_Char)
- #define SET_BIT(i,n) ((i) |= (1 << ((n) - 1)))
- #define TEST_BIT(i,n) ((i) & (1 << ((n) - 1)))
- #define U_CHAR_AT(p) ((unsigned int) *(char32 *)(p))
- /* Flags to be passed up and down via function parameters during compile. */
- #define WORST 0 /* Worst case. No assumptions can be made.*/
- #define HAS_WIDTH 1 /* Known never to match null string. */
- #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
- #define NO_PAREN 0 /* Only set by initial call to "chunk". */
- #define PAREN 1 /* Used for normal capturing parentheses. */
- #define NO_CAPTURE 2 /* Non-capturing parentheses (grouping only). */
- #define INSENSITIVE 3 /* Case insensitive parenthetical construct */
- #define SENSITIVE 4 /* Case sensitive parenthetical construct */
- #define NEWLINE 5 /* Construct to match newlines in most cases */
- #define NO_NEWLINE 6 /* Construct to match newlines normally */
- #define REG_INFINITY 0UL
- #define REG_ZERO 0UL
- #define REG_ONE 1UL
- /* Flags for function shortcut_escape() */
- #define CHECK_ESCAPE 0 /* Check an escape sequence for validity only. */
- #define CHECK_CLASS_ESCAPE 1 /* Check the validity of an escape within a character class */
- #define EMIT_NODE 2 /* Emit the appropriate node. */
- /* Number of bytes to offset from the beginning of the regex program to the start
- of the actual compiled regex code, i.e. skipping over the MAGIC number and
- the two counters at the front. */
- #define REGEX_START_OFFSET 3
- #define MAX_COMPILED_SIZE 32767UL /* Largest size a compiled regex can be.
- Probably could be 65535UL. */
- /* Global work variables for `CompileRE'. */
- static char32 *Reg_Parse; /* Input scan ptr (scans user's regex) */
- static int Total_Paren; /* Parentheses, (), counter. */
- static int Num_Braces; /* Number of general {m,n} constructs.
- {m,n} quantifiers of SIMPLE atoms are
- not included in this count. */
- static int Closed_Parens; /* Bit flags indicating () closure. */
- static int Paren_Has_Width; /* Bit flags indicating ()'s that are
- known to not match the empty string */
- static char32 Compute_Size; /* Address of this used as flag. */
- static char32 *Code_Emit_Ptr; /* When Code_Emit_Ptr is set to
- &Compute_Size no code is emitted.
- Instead, the size of code that WOULD
- have been generated is accumulated in
- Reg_Size. Otherwise, Code_Emit_Ptr
- points to where compiled regex code is
- to be written. */
- static unsigned long Reg_Size; /* Size of compiled regex code. */
- static const char32 **Error_Ptr; /* Place to store error messages so
- they can be returned by `CompileRE' */
- static char32 Error_Text [128];/* Sting to build error messages in. */
- static int Is_Case_Insensitive;
- static int Match_Newline;
- static int Enable_Counting_Quantifier = 1;
- static char32 Brace_Char;
- static char32 Default_Meta_Char [] = { '{', '.', '*', '+', '?', '[', '(', '|', ')', '^', '<', '>', '$', '\0' };
- static char32 *Meta_Char;
- typedef struct {
- long lower;
- long upper;
- } len_range;
- /* Forward declarations for functions used by `CompileRE'. */
- static char32 *alternative (int *flag_param, len_range *range_param);
- static char32 *back_ref (char32 *c, int *flag_param, int emit);
- static char32 *chunk (int paren, int *flag_param, len_range *range_param);
- static void emit_byte (char32 c);
- static void emit_class_byte (char32 c);
- static char32 *emit_node (int op_code);
- static char32 *emit_special (char32 op_code, unsigned long test_val, int index);
- static char32 literal_escape (char32 c);
- static char32 numeric_escape (char32 c, char32 **parse);
- static char32 *atom (int *flag_param, len_range *range_param);
- static void reg_error (const char32_t *str);
- static char32 *insert (char32 op, char32 *opnd, long min, long max, int index);
- static char32 *next_ptr (char32 *ptr);
- static void offset_tail (char32 *ptr, int offset, char32 *val);
- static void branch_tail (char32 *ptr, int offset, char32 *val);
- static char32 *piece (int *flag_param, len_range *range_param);
- static void tail (char32 *search_from, char32 *point_t);
- static char32 *shortcut_escape (char32 c, int *flag_param, int emit);
- /*----------------------------------------------------------------------*
- * CompileRE
- *
- * Compiles a regular expression into the internal format used by
- * `ExecRE'.
- *
- * The default behaviour wrt. case sensitivity and newline matching can
- * be controlled through the defaultFlags argument (Markus Schwarzenberg).
- * Future extensions are possible by using other flag bits.
- * Note that currently only the case sensitivity flag is effectively used.
- *
- * Beware that the optimization and preparation code in here knows about
- * some of the structure of the compiled regexp.
- *----------------------------------------------------------------------*/
- regexp *CompileRE_throwable (conststring32 exp, int defaultFlags) {
- conststring32 compileMessage;
- regexp *compiledRE = CompileRE (exp, & compileMessage, defaultFlags);
- if (compiledRE == NULL) {
- Melder_throw (U"Regular expression: ", compileMessage, U".");
- }
- return compiledRE;
- }
- regexp *CompileRE (conststring32 exp, conststring32 *errorText, int defaultFlags) {
- regexp *comp_regex = NULL;
- char32 *scan;
- int flags_local, pass;
- len_range range_local;
- if (Enable_Counting_Quantifier) {
- Brace_Char = '{';
- Meta_Char = &Default_Meta_Char [0];
- } else {
- Brace_Char = '*'; /* Bypass the '{' in */
- Meta_Char = &Default_Meta_Char [1]; /* Default_Meta_Char */
- }
- /* Set up errorText to receive failure reports. */
- Error_Ptr = errorText;
- *Error_Ptr = U"";
- if (exp == NULL) {
- REG_FAIL (U"NULL argument, `CompileRE\'");
- }
- Code_Emit_Ptr = &Compute_Size;
- Reg_Size = 0UL;
- /* We can't allocate space until we know how big the compiled form will be,
- but we can't compile it (and thus know how big it is) until we've got a
- place to put the code. So we cheat: we compile it twice, once with code
- generation turned off and size counting turned on, and once "for real".
- This also means that we don't allocate space until we are sure that the
- thing really will compile successfully, and we never have to move the
- code and thus invalidate pointers into it. (Note that it has to be in
- one piece because free() should be able to free it all.) */
- for (pass = 1; pass <= 2; pass++) {
- /*-------------------------------------------*
- * FIRST PASS: Determine size and legality. *
- * SECOND PASS: Emit code. *
- *-------------------------------------------*/
- /* Schwarzenberg:
- * If defaultFlags = 0 use standard defaults:
- * Is_Case_Insensitive: Case sensitive is the default
- * Match_Newline: Newlines are NOT matched by default
- * in character classes
- */
- Is_Case_Insensitive = ( (defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
- Match_Newline = 0; /* ((defaultFlags & REDFLT_MATCH_NEWLINE) ? 1 : 0);
- Currently not used. Uncomment if needed. */
- Reg_Parse = (char32 *) exp;
- Total_Paren = 1;
- Num_Braces = 0;
- Closed_Parens = 0;
- Paren_Has_Width = 0;
- emit_byte (MAGIC);
- emit_byte ('%'); /* Placeholder for num of capturing parentheses. */
- emit_byte ('%'); /* Placeholder for num of general {m,n} constructs. */
- if (chunk (NO_PAREN, &flags_local, &range_local) == NULL) {
- return (NULL); /* Something went wrong */
- }
- if (pass == 1) {
- if (Reg_Size >= MAX_COMPILED_SIZE) {
- /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
- This is a real issue since the first BRANCH node usually points
- to the end of the compiled regex code. */
- Melder_sprint (Error_Text,128, U"regexp > ", MAX_COMPILED_SIZE, U" bytes");
- REG_FAIL (Error_Text);
- }
- /* Allocate memory. */
- comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size * sizeof (char32));
- if (comp_regex == NULL) {
- REG_FAIL (U"out of memory in `CompileRE\'");
- }
- Code_Emit_Ptr = (char32 *) comp_regex->program;
- }
- }
- comp_regex->program [1] = (char32) Total_Paren - 1;
- comp_regex->program [2] = (char32) Num_Braces;
- /*----------------------------------------*
- * Dig out information for optimizations. *
- *----------------------------------------*/
- comp_regex->match_start = '\0'; /* Worst-case defaults. */
- comp_regex->anchor = 0;
- /* First BRANCH. */
- scan = (char32 *) (comp_regex->program + REGEX_START_OFFSET);
- if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
- scan = OPERAND (scan);
- /* Starting-point info. */
- if (GET_OP_CODE (scan) == EXACTLY) {
- comp_regex->match_start = *OPERAND (scan);
- } else if (PLUS <= GET_OP_CODE (scan) &&
- GET_OP_CODE (scan) <= LAZY_PLUS) {
- /* Allow x+ or x+? at the start of the regex to be
- optimized. */
- if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
- comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
- }
- } else if (GET_OP_CODE (scan) == BOL) {
- comp_regex->anchor++;
- }
- }
- return (comp_regex);
- }
- /*----------------------------------------------------------------------*
- * chunk *
- * *
- * Process main body of regex or process a parenthesized "thing". *
- * *
- * Caller must absorb opening parenthesis. *
- * *
- * Combining parenthesis handling with the base level of regular *
- * expression is a trifle forced, but the need to tie the tails of the *
- * branches to what follows makes it hard to avoid. *
- *----------------------------------------------------------------------*/
- static char32 *chunk (int paren, int *flag_param, len_range *range_param) {
- char32 *ret_val = NULL;
- char32 *this_branch;
- char32 *ender = NULL;
- int this_paren = 0;
- int flags_local, first = 1, zero_width, i;
- int old_sensitive = Is_Case_Insensitive;
- int old_newline = Match_Newline;
- len_range range_local;
- int look_only = 0;
- char32 *emit_look_behind_bounds = NULL;
- *flag_param = HAS_WIDTH; /* Tentatively. */
- range_param->lower = 0; /* Idem */
- range_param->upper = 0;
- /* Make an OPEN node, if parenthesized. */
- if (paren == PAREN) {
- if (Total_Paren >= NSUBEXP) {
- Melder_sprint (Error_Text,128, U"number of ()'s > ", NSUBEXP);
- REG_FAIL (Error_Text)
- }
- this_paren = Total_Paren; Total_Paren++;
- ret_val = emit_node (OPEN + this_paren);
- } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
- *flag_param = WORST; /* Look ahead is zero width. */
- look_only = 1;
- ret_val = emit_node (paren);
- } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
- *flag_param = WORST; /* Look behind is zero width. */
- look_only = 1;
- /* We'll overwrite the zero length later on, so we save the ptr */
- ret_val = emit_special (paren, 0, 0);
- emit_look_behind_bounds = ret_val + NODE_SIZE;
- } else if (paren == INSENSITIVE) {
- Is_Case_Insensitive = 1;
- } else if (paren == SENSITIVE) {
- Is_Case_Insensitive = 0;
- } else if (paren == NEWLINE) {
- Match_Newline = 1;
- } else if (paren == NO_NEWLINE) {
- Match_Newline = 0;
- }
- /* Pick up the branches, linking them together. */
- do {
- this_branch = alternative (&flags_local, &range_local);
- if (this_branch == NULL) {
- return (NULL);
- }
- if (first) {
- first = 0;
- *range_param = range_local;
- if (ret_val == NULL) {
- ret_val = this_branch;
- }
- } else if (range_param->lower >= 0) {
- if (range_local.lower >= 0) {
- if (range_local.lower < range_param->lower) {
- range_param->lower = range_local.lower;
- }
- if (range_local.upper > range_param->upper) {
- range_param->upper = range_local.upper;
- }
- } else {
- range_param->lower = -1; /* Branches have different lengths */
- range_param->upper = -1;
- }
- }
- tail (ret_val, this_branch); /* Connect BRANCH -> BRANCH. */
- /* If any alternative could be zero width, consider the whole
- parenthisized thing to be zero width. */
- if (! (flags_local & HAS_WIDTH)) {
- *flag_param &= ~HAS_WIDTH;
- }
- /* Are there more alternatives to process? */
- if (*Reg_Parse != '|') {
- break;
- }
- Reg_Parse++;
- } while (1);
- /* Make a closing node, and hook it on the end. */
- if (paren == PAREN) {
- ender = emit_node (CLOSE + this_paren);
- } else if (paren == NO_PAREN) {
- ender = emit_node (END);
- } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
- ender = emit_node (LOOK_AHEAD_CLOSE);
- } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
- ender = emit_node (LOOK_BEHIND_CLOSE);
- } else {
- ender = emit_node (NOTHING);
- }
- tail (ret_val, ender);
- /* Hook the tails of the branch alternatives to the closing node. */
- for (this_branch = ret_val; this_branch != NULL;) {
- branch_tail (this_branch, NODE_SIZE, ender);
- this_branch = next_ptr (this_branch);
- }
- /* Check for proper termination. */
- if (paren != NO_PAREN && *Reg_Parse++ != ')') {
- REG_FAIL (U"missing right parenthesis \')\'");
- } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
- if (*Reg_Parse == ')') {
- REG_FAIL (U"missing left parenthesis \'(\'");
- } else {
- REG_FAIL (U"junk on end"); /* "Can't happen" - NOTREACHED */
- }
- }
- /* Check whether look behind has a fixed size */
- if (emit_look_behind_bounds) {
- if (range_param->lower < 0) {
- REG_FAIL (U"look-behind does not have a bounded size");
- }
- if (range_param->upper > 65535L) {
- REG_FAIL (U"max. look-behind size is too large (>65535)")
- }
- if (Code_Emit_Ptr != &Compute_Size) {
- *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
- *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
- *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
- *emit_look_behind_bounds = PUT_OFFSET_R (range_param->upper);
- }
- }
- /* For look ahead/behind, the length should be set to zero again */
- if (look_only) {
- range_param->lower = 0;
- range_param->upper = 0;
- }
- zero_width = 0;
- /* Set a bit in Closed_Parens to let future calls to function `back_ref'
- know that we have closed this set of parentheses. */
- if (paren == PAREN && this_paren <= (int) sizeof (Closed_Parens) * CHAR_BIT) {
- SET_BIT (Closed_Parens, this_paren);
- /* Determine if a parenthesized expression is modified by a quantifier
- that can have zero width. */
- if (* (Reg_Parse) == '?' || * (Reg_Parse) == '*') {
- zero_width++;
- } else if (* (Reg_Parse) == '{' && Brace_Char == '{') {
- if (* (Reg_Parse + 1) == ',' || * (Reg_Parse + 1) == '}') {
- zero_width++;
- } else if (* (Reg_Parse + 1) == '0') {
- i = 2;
- while (* (Reg_Parse + i) == '0') {
- i++;
- }
- if (* (Reg_Parse + i) == ',') {
- zero_width++;
- }
- }
- }
- }
- /* If this set of parentheses is known to never match the empty string, set
- a bit in Paren_Has_Width to let future calls to function back_ref know
- that this set of parentheses has non-zero width. This will allow star
- (*) or question (?) quantifiers to be aplied to a back-reference that
- refers to this set of parentheses. */
- if ( (*flag_param & HAS_WIDTH) &&
- paren == PAREN &&
- !zero_width &&
- this_paren <= (int) (sizeof (Paren_Has_Width) * CHAR_BIT)) {
- SET_BIT (Paren_Has_Width, this_paren);
- }
- Is_Case_Insensitive = old_sensitive;
- Match_Newline = old_newline;
- return (ret_val);
- }
- /*----------------------------------------------------------------------*
- * alternative
- *
- * Processes one alternative of an '|' operator. Connects the NEXT
- * pointers of each regex atom together sequentialy.
- *----------------------------------------------------------------------*/
- static char32 *alternative (int *flag_param, len_range *range_param) {
- char32 *ret_val;
- char32 *chain;
- char32 *latest;
- int flags_local;
- len_range range_local;
- *flag_param = WORST; /* Tentatively. */
- range_param->lower = 0; /* Idem */
- range_param->upper = 0;
- ret_val = emit_node (BRANCH);
- chain = NULL;
- /* Loop until we hit the start of the next alternative, the end of this set
- of alternatives (end of parentheses), or the end of the regex. */
- while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
- latest = piece (&flags_local, &range_local);
- if (latest == NULL) {
- return (NULL); /* Something went wrong. */
- }
- *flag_param |= flags_local & HAS_WIDTH;
- if (range_local.lower < 0) {
- /* Not a fixed length */
- range_param->lower = -1;
- range_param->upper = -1;
- } else if (range_param->lower >= 0) {
- range_param->lower += range_local.lower;
- range_param->upper += range_local.upper;
- }
- if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
- tail (chain, latest);
- }
- chain = latest;
- }
- if (chain == NULL) { /* Loop ran zero times. */
- (void) emit_node (NOTHING);
- }
- return (ret_val);
- }
- /*----------------------------------------------------------------------*
- * piece - something followed by possible '*', '+', '?', or "{m,n}"
- *
- * Note that the branching code sequences used for the general cases of
- * *, +. ?, and {m,n} are somewhat optimized: they use the same
- * NOTHING node as both the endmarker for their branch list and the
- * body of the last branch. It might seem that this node could be
- * dispensed with entirely, but the endmarker role is not redundant.
- *----------------------------------------------------------------------*/
- static char32 *piece (int *flag_param, len_range *range_param) {
- char32 *ret_val;
- char32 *next;
- char32 op_code;
- unsigned long min_max [2] = {REG_ZERO, REG_INFINITY};
- int flags_local, i, brace_present = 0;
- int lazy = 0, comma_present = 0;
- int digit_present [2] = {0, 0};
- len_range range_local;
- ret_val = atom (&flags_local, &range_local);
- if (ret_val == NULL) {
- return (NULL); /* Something went wrong. */
- }
- op_code = *Reg_Parse;
- if (!IS_QUANTIFIER (op_code)) {
- *flag_param = flags_local;
- *range_param = range_local;
- return (ret_val);
- } else if (op_code == '{') { /* {n,m} quantifier present */
- brace_present++;
- Reg_Parse++;
- /* This code will allow specifying a counting range in any of the
- following forms:
- {m,n} between m and n.
- {,n} same as {0,n} or between 0 and infinity.
- {m,} same as {m,0} or between m and infinity.
- {m} same as {m,m} or exactly m.
- {,} same as {0,0} or between 0 and infinity or just '*'.
- {} same as {0,0} or between 0 and infinity or just '*'.
- Note that specifying a max of zero, {m,0} is not allowed in the regex
- itself, but it is implemented internally that way to support '*', '+',
- and {min,} constructs and signals an unlimited number. */
- for (i = 0; i < 2; i++) {
- /* Look for digits of number and convert as we go. The numeric maximum
- value for max and min of 65,535 is due to using 2 bytes to store
- each value in the compiled regex code. */
- while (Melder_isAsciiDecimalNumber (*Reg_Parse)) {
- /* (6553 * 10 + 6) > 65535 (16 bit max) */
- if ( (min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
- (min_max [i] <= 6552UL)) {
- min_max [i] = (min_max [i] * 10UL) +
- (unsigned long) (*Reg_Parse - '0');
- Reg_Parse++;
- digit_present [i]++;
- } else {
- if (i == 0) {
- Melder_sprint (Error_Text,128, U"min operand of {", min_max [0], *Reg_Parse, U",???} > 65535");
- } else {
- Melder_sprint (Error_Text,128, U"max operand of {", min_max [0], U",", min_max [1], *Reg_Parse, U"} > 65535");
- }
- REG_FAIL (Error_Text);
- }
- }
- if (!comma_present && *Reg_Parse == ',') {
- comma_present++;
- Reg_Parse++;
- }
- }
- /* A max of zero cannot be specified directly in the regex since it would
- signal a max of infinity. This code specifically disallows `{0,0}',
- `{,0}', and `{0}' which really means nothing to humans but would be
- interpreted as `{0,infinity}' or `*' if we didn't make this check. */
- if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
- REG_FAIL (U"{0} is an invalid range");
- } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
- digit_present [1] && (min_max [1] == REG_ZERO)) {
- REG_FAIL (U"{0,0} is an invalid range");
- } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
- if (digit_present [0]) {
- Melder_sprint (Error_Text,128, U"{", min_max [0], U",0} is an invalid range");
- REG_FAIL (Error_Text);
- } else {
- REG_FAIL (U"{,0} is an invalid range");
- }
- }
- if (!comma_present) {
- min_max [1] = min_max [0];
- } /* {x} means {x,x} */
- if (*Reg_Parse != '}') {
- REG_FAIL (U"{m,n} specification missing right \'}\'");
- } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
- /* Disallow a backward range. */
- Melder_sprint (Error_Text,128, U"{", min_max [0], U",", min_max [1], U"} is an invalid range");
- REG_FAIL (Error_Text);
- }
- }
- Reg_Parse++;
- /* Check for a minimal matching (non-greedy or "lazy") specification. */
- if (*Reg_Parse == '?') {
- lazy = 1;
- Reg_Parse++;
- }
- /* Avoid overhead of counting if possible */
- if (op_code == '{') {
- if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
- op_code = '*';
- } else if (min_max [0] == REG_ONE && min_max [1] == REG_INFINITY) {
- op_code = '+';
- } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
- op_code = '?';
- } else if (min_max [0] == REG_ONE && min_max [1] == REG_ONE) {
- /* "x{1,1}" is the same as "x". No need to pollute the compiled
- regex with such nonsense. */
- *flag_param = flags_local;
- *range_param = range_local;
- return (ret_val);
- } else if (Num_Braces > (int) UCHAR_MAX) {
- Melder_sprint (Error_Text,128, U"number of {m,n} constructs > ", UCHAR_MAX);
- REG_FAIL (Error_Text);
- }
- }
- if (op_code == '+') {
- min_max [0] = REG_ONE;
- }
- if (op_code == '?') {
- min_max [1] = REG_ONE;
- }
- /* It is dangerous to apply certain quantifiers to a possibly zero width
- item. */
- if (! (flags_local & HAS_WIDTH)) {
- if (brace_present) {
- Melder_sprint (Error_Text,128, U"{", min_max [0], U",", min_max [1], U"} operand could be empty");
- } else {
- Melder_sprint (Error_Text,128, op_code, U" operand could be empty");
- }
- REG_FAIL (Error_Text);
- }
- *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
- if (range_local.lower >= 0) {
- if (min_max[1] != REG_INFINITY) {
- range_param->lower = range_local.lower * min_max[0];
- range_param->upper = range_local.upper * min_max[1];
- } else {
- range_param->lower = -1; /* Not a fixed-size length */
- range_param->upper = -1;
- }
- } else {
- range_param->lower = -1; /* Not a fixed-size length */
- range_param->upper = -1;
- }
- /*---------------------------------------------------------------------*
- * Symbol Legend For Node Structure Diagrams
- *---------------------------------------------------------------------*
- * (...) = general grouped thing
- * B = (B)ranch, K = bac(K), N = (N)othing
- * I = (I)nitialize count, C = Increment (C)ount
- * T~m = (T)est against mini(m)um- go to NEXT pointer if >= operand
- * T~x = (T)est against ma(x)imum- go to NEXT pointer if >= operand
- * '~' = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
- *---------------------------------------------------------------------*/
- if (op_code == '*' && (flags_local & SIMPLE)) {
- insert ( (lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
- } else if (op_code == '+' && (flags_local & SIMPLE)) {
- insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
- } else if (op_code == '?' && (flags_local & SIMPLE)) {
- insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
- } else if (op_code == '{' && (flags_local & SIMPLE)) {
- insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
- } else if ( (op_code == '*' || op_code == '+') && lazy) {
- /* Node structure for (x)*? Node structure for (x)+? construct.
- * construct. (Same as (x)*? except for initial
- * forward jump into parenthesis.)
- *
- * ___6____
- * _______5_______ /________|______
- * | _4__ 1_\ /| ____ | _\
- * |/ | / |\ / |/ | | / |\
- * B~ N~ B~ (...)~ K~ N~ N~ B~ N~ B~ (...)~ K~ N~
- * \ \___2_______| \ \___________|
- * \_____3_______| \_____________|
- *
- */
- tail (ret_val, emit_node (BACK)); /* 1 */
- (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
- (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
- next = emit_node (NOTHING); /* 2,3 */
- offset_tail (ret_val, NODE_SIZE, next); /* 2 */
- tail (ret_val, next); /* 3 */
- insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,5 */
- tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
- offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 5 */
- if (op_code == '+') {
- insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
- tail (ret_val, ret_val + (4 * NODE_SIZE)); /* 6 */
- }
- } else if (op_code == '*') {
- /* Node structure for (x)* construct.
- * ____1_____
- * | \
- * B~ (...)~ K~ B~ N~
- * \ \_|2 |\_|
- * \__3_______| 4
- */
- insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1,3 */
- offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
- offset_tail (ret_val, NODE_SIZE, ret_val); /* 1 */
- tail (ret_val, emit_node (BRANCH)); /* 3 */
- tail (ret_val, emit_node (NOTHING)); /* 4 */
- } else if (op_code == '+') {
- /* Node structure for (x)+ construct.
- *
- * ____2_____
- * | \
- * (...)~ B~ K~ B~ N~
- * \_|\____|\_|
- * 1 3 4
- */
- next = emit_node (BRANCH); /* 1 */
- tail (ret_val, next); /* 1 */
- tail (emit_node (BACK), ret_val); /* 2 */
- tail (next, emit_node (BRANCH)); /* 3 */
- tail (ret_val, emit_node (NOTHING)); /* 4 */
- } else if (op_code == '?' && lazy) {
- /* Node structure for (x)?? construct.
- * _4__ 1_
- * / | / |
- * B~ N~ B~ (...)~ N~
- * \ \___2____|
- * \_____3____|
- */
- (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
- (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
- next = emit_node (NOTHING); /* 1,2,3 */
- offset_tail (ret_val, 2 * NODE_SIZE, next); /* 1 */
- offset_tail (ret_val, NODE_SIZE, next); /* 2 */
- tail (ret_val, next); /* 3 */
- insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4 */
- tail (ret_val, (ret_val + (2 * NODE_SIZE))); /* 4 */
- } else if (op_code == '?') {
- /* Node structure for (x)? construct.
- * ___1____ _2
- * / |/ |
- * B~ (...)~ B~ N~
- * \__3_|
- */
- insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1 */
- tail (ret_val, emit_node (BRANCH)); /* 1 */
- next = emit_node (NOTHING); /* 2,3 */
- tail (ret_val, next); /* 2 */
- offset_tail (ret_val, NODE_SIZE, next); /* 3 */
- } else if (op_code == '{' && min_max [0] == min_max [1]) {
- /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
- * Note that minimal and maximal matching mean the same thing when we
- * specify the minimum and maximum to be the same value.
- * _______3_____
- * | 1_ _2 \
- * | / |/ | \
- * I~ (...)~ C~ T~m K~ N~
- * \_| \_____|
- * 5 4
- */
- tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
- tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
- tail (emit_node (BACK), ret_val); /* 3 */
- tail (ret_val, emit_node (NOTHING)); /* 4 */
- next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
- tail (ret_val, next); /* 5 */
- Num_Braces++;
- } else if (op_code == '{' && lazy) {
- if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
- /* Node structure for (x){0,n}? or {,n}? construct.
- * _________3____________
- * 8_| _4__ 1_ _2 \
- * / |/ | / |/ | \
- * I~ B~ N~ B~ (...)~ C~ T~x K~ N~
- * \ \ \__7__|
- * \ \_________6_______|
- * \______5____________|
- */
- tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
- next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
- tail (ret_val, next); /* 2 */
- (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 4,6 */
- (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
- (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 3,4,8 */
- tail (emit_node (BACK), ret_val); /* 3 */
- tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
- next = emit_node (NOTHING); /* 5,6,7 */
- offset_tail (ret_val, NODE_SIZE, next); /* 5 */
- offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
- offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
- next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
- tail (ret_val, next); /* 8 */
- } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
- /* Node structure for (x){m,}? construct.
- * ______8_________________
- * | _______3_____ \
- * | _7__ | 1_ _2 \ \
- * |/ | | / |/ | \ \
- * I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
- * \_____\__\_| \_4___| |
- * 9 \ \_________5__________|
- * \_______6______________|
- */
- tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
- next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
- tail (ret_val, next); /* 2 */
- tail (emit_node (BACK), ret_val); /* 3 */
- tail (ret_val, emit_node (BACK)); /* 4 */
- (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,7 */
- (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
- next = emit_node (NOTHING); /* 5,6 */
- offset_tail (ret_val, NODE_SIZE, next); /* 5 */
- tail (ret_val, next); /* 6 */
- (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 7,8 */
- tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 7 */
- offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 8 */
- (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
- tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 9 */
- } else {
- /* Node structure for (x){m,n}? construct.
- * ______9_____________________
- * | _____________3___ \
- * | __8_ | 1_ _2 \ \
- * |/ | | / |/ | \ \
- * I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
- * \_____\__\_| \ \__4__| |
- * 10 \ \ \_7_________|
- * \ \_________6_____________|
- * \_______5_________________|
- */
- tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
- next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
- tail (ret_val, next); /* 2 */
- next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
- tail (emit_node (BACK), ret_val); /* 3 */
- tail (next, emit_node (BACK)); /* 4 */
- (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 6,8 */
- (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 5 */
- (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 8,9 */
- next = emit_node (NOTHING); /* 5,6,7 */
- offset_tail (ret_val, NODE_SIZE, next); /* 5 */
- offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
- offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
- tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 8 */
- offset_tail (next, -NODE_SIZE, ret_val); /* 9 */
- insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 10 */
- tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 10 */
- }
- Num_Braces++;
- } else if (op_code == '{') {
- if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
- /* Node structure for (x){0,n} or (x){,n} construct.
- *
- * ___3____________
- * | 1_ _2 \ 5_
- * | / |/ | \ / |
- * I~ B~ (...)~ C~ T~x K~ B~ N~
- * \_|\ \_6___|__|
- * 7 \________4________|
- */
- tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
- next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
- tail (ret_val, next); /* 2 */
- (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 3,4,7 */
- tail (emit_node (BACK), ret_val); /* 3 */
- next = emit_node (BRANCH); /* 4,5 */
- tail (ret_val, next); /* 4 */
- tail (next, emit_node (NOTHING)); /* 5,6 */
- offset_tail (ret_val, NODE_SIZE, next); /* 6 */
- next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
- tail (ret_val, next); /* 7 */
- } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
- /* Node structure for (x){m,} construct.
- * __________4________
- * | __3__________ \
- * _|___| 1_ _2 \ \ _7
- * / | 8 | / |/ | \ \ / |
- * I~ B~ (...)~ C~ T~m K~ K~ B~ N~
- * \ \_5___| |
- * \__________6__________|
- */
- tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
- next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
- tail (ret_val, next); /* 2 */
- tail (emit_node (BACK), ret_val); /* 3 */
- (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,6 */
- next = emit_node (BACK); /* 4 */
- tail (next, ret_val); /* 4 */
- offset_tail (ret_val, NODE_SIZE, next); /* 5 */
- tail (ret_val, emit_node (BRANCH)); /* 6 */
- tail (ret_val, emit_node (NOTHING)); /* 7 */
- insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
- tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 8 */
- } else {
- /* Node structure for (x){m,n} construct.
- * _____6________________
- * | _____________3___ \
- * 9_|__| 1_ _2 \ \ _8
- * / | | / |/ | \ \ / |
- * I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
- * \ \ \__4__| | |
- * \ \_7_________|__|
- * \_________5_____________|
- */
- tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
- next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
- tail (ret_val, next); /* 2 */
- next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
- tail (emit_node (BACK), ret_val); /* 3 */
- tail (next, emit_node (BACK)); /* 4 */
- (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,6 */
- next = emit_node (BRANCH); /* 5,8 */
- tail (ret_val, next); /* 5 */
- offset_tail (next, -NODE_SIZE, ret_val); /* 6 */
- next = emit_node (NOTHING); /* 7,8 */
- offset_tail (ret_val, NODE_SIZE, next); /* 7 */
- offset_tail (next, -NODE_SIZE, next); /* 8 */
- (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
- tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 9 */
- }
- Num_Braces++;
- } else {
- /* We get here if the IS_QUANTIFIER macro is not coordinated properly
- with this function. */
- REG_FAIL (U"internal error #2, `piece\'");
- }
- if (IS_QUANTIFIER (*Reg_Parse)) {
- if (op_code == '{') {
- Melder_sprint (Error_Text,128, U"nested quantifiers, {m,n}", *Reg_Parse);
- } else {
- Melder_sprint (Error_Text,128, U"nested quantifiers, ", op_code, *Reg_Parse);
- }
- REG_FAIL (Error_Text);
- }
- return (ret_val);
- }
- /*----------------------------------------------------------------------*
- * atom
- *
- * Process one regex item at the lowest level
- *
- * OPTIMIZATION: Lumps a continuous sequence of ordinary characters
- * together so that it can turn them into a single EXACTLY node, which
- * is smaller to store and faster to run.
- *----------------------------------------------------------------------*/
- static char32 *atom (int *flag_param, len_range *range_param) {
- char32 *ret_val;
- char32 test;
- int flags_local;
- len_range range_local;
- *flag_param = WORST; /* Tentatively. */
- range_param->lower = 0; /* Idem */
- range_param->upper = 0;
- /* Process any regex comments, e.g. `(?# match next token->)'. The
- terminating right parenthesis cannot be escaped. The comment stops at
- the first right parenthesis encountered (or the end of the regex
- string)... period. Handles multiple sequential comments,
- e.g. `(?# one)(?# two)...' */
- while (*Reg_Parse == '(' &&
- * (Reg_Parse + 1) == '?' &&
- * (Reg_Parse + 2) == '#') {
- Reg_Parse += 3;
- while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
- Reg_Parse++;
- }
- if (*Reg_Parse == ')') {
- Reg_Parse++;
- }
- if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
- /* Hit end of regex string or end of parenthesized regex; have to
- return "something" (i.e. a NOTHING node) to avoid generating an
- error. */
- ret_val = emit_node (NOTHING);
- return (ret_val);
- }
- }
- switch (*Reg_Parse ++) {
- case U'^':
- ret_val = emit_node (BOL);
- break;
- case U'$':
- ret_val = emit_node (EOL);
- break;
- case U'<':
- ret_val = emit_node (BOWORD);
- break;
- case U'>':
- ret_val = emit_node (EOWORD);
- break;
- case U'.':
- if (Match_Newline) {
- ret_val = emit_node (EVERY);
- } else {
- ret_val = emit_node (ANY);
- }
- *flag_param |= (HAS_WIDTH | SIMPLE);
- range_param->lower = 1;
- range_param->upper = 1;
- break;
- case U'(':
- if (*Reg_Parse == U'?') { /* Special parenthetical expression */
- Reg_Parse++;
- range_local.lower = 0; /* Make sure it is always used */
- range_local.upper = 0;
- if (*Reg_Parse == U':') {
- Reg_Parse ++;
- ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
- } else if (*Reg_Parse == U'=') {
- Reg_Parse ++;
- ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
- } else if (*Reg_Parse == U'!') {
- Reg_Parse ++;
- ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
- } else if (*Reg_Parse == U'i') {
- Reg_Parse ++;
- ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
- } else if (*Reg_Parse == U'I') {
- Reg_Parse ++;
- ret_val = chunk (SENSITIVE, &flags_local, &range_local);
- } else if (*Reg_Parse == U'n') {
- Reg_Parse ++;
- ret_val = chunk (NEWLINE, &flags_local, &range_local);
- } else if (*Reg_Parse == U'N') {
- Reg_Parse ++;
- ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
- } else if (*Reg_Parse == U'<') {
- Reg_Parse ++;
- if (*Reg_Parse == U'=') {
- Reg_Parse ++;
- ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
- } else if (*Reg_Parse == U'!') {
- Reg_Parse ++;
- ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
- } else {
- Melder_sprint (Error_Text,128, U"invalid look-behind syntax, \"(?<", *Reg_Parse, U"...)\"");
- REG_FAIL (Error_Text)
- }
- } else {
- Melder_sprint (Error_Text,128, U"invalid grouping syntax, \"(?", *Reg_Parse, U"...)\"");
- REG_FAIL (Error_Text)
- }
- } else { /* Normal capturing parentheses */
- ret_val = chunk (PAREN, & flags_local, & range_local);
- }
- if (! ret_val)
- return nullptr; // something went wrong
- /* Add HAS_WIDTH flag if it was set by call to chunk. */
- *flag_param |= flags_local & HAS_WIDTH;
- *range_param = range_local;
- break;
- case U'\0':
- case U'|':
- case U')':
- REG_FAIL (U"internal error #3, `atom\'") // supposed to be caught earlier
- case U'?':
- case U'+':
- case U'*':
- Melder_sprint (Error_Text,128, * (Reg_Parse - 1), U" follows nothing");
- REG_FAIL (Error_Text)
- case U'{':
- if (Enable_Counting_Quantifier) {
- REG_FAIL (U"{m,n} follows nothing")
- } else {
- ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
- emit_byte (U'{');
- emit_byte (U'\0');
- range_param->lower = 1;
- range_param->upper = 1;
- }
- break;
- case U'[': {
- char32 last_emit = U'\0';
- /* Handle characters that can only occur at the start of a class. */
- if (*Reg_Parse == U'^') { /* Complement of range. */
- ret_val = emit_node (ANY_BUT);
- Reg_Parse ++;
- /* All negated classes include newline unless escaped with
- a "(?n)" switch. */
- if (!Match_Newline) {
- emit_byte (U'\n');
- }
- } else {
- ret_val = emit_node (ANY_OF);
- }
- if (*Reg_Parse == U']' || *Reg_Parse == U'-') {
- /* If '-' or ']' is the first character in a class,
- it is a literal character in the class. */
- last_emit = *Reg_Parse;
- emit_byte (*Reg_Parse);
- Reg_Parse ++;
- }
- /* Handle the rest of the class characters. */
- while (*Reg_Parse != U'\0' && *Reg_Parse != U']') {
- if (*Reg_Parse == U'-') { /* Process a range, e.g [a-z]. */
- Reg_Parse ++;
- if (*Reg_Parse == U']' || *Reg_Parse == U'\0') {
- /* If '-' is the last character in a class it is a literal
- character. If `Reg_Parse' points to the end of the
- regex string, an error will be generated later. */
- emit_byte (U'-');
- last_emit = U'-';
- } else {
- /* We must get the range starting character value from the
- emitted code since it may have been an escaped
- character. `second_value' is set one larger than the
- just emitted character value. This is done since
- `second_value' is used as the start value for the loop
- that emits the values in the range. Since we have
- already emitted the first character of the class, we do
- not want to emit it again. */
- char32 second_value = last_emit + 1, last_value;
- if (*Reg_Parse == U'\\') {
- /* Handle escaped characters within a class range.
- Specifically disallow shortcut escapes as the end of
- a class range. To allow this would be ambiguous
- since shortcut escapes represent a set of characters,
- and it would not be clear which character of the
- class should be treated as the "last" character. */
- Reg_Parse++;
- if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != U'\0') {
- last_value = test;
- } else if ((test = literal_escape (*Reg_Parse)) != U'\0') {
- last_value = test;
- } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_CLASS_ESCAPE)) {
- Melder_sprint (Error_Text, 128, U"\\", *Reg_Parse, U" is not allowed as range operand");
- REG_FAIL (Error_Text)
- } else {
- Melder_sprint (Error_Text, 128, U"\\", *Reg_Parse, U" is an invalid char class escape sequence");
- REG_FAIL (Error_Text)
- }
- } else {
- last_value = U_CHAR_AT (Reg_Parse);
- }
- if (Is_Case_Insensitive) {
- second_value = Melder_toLowerCase (second_value); // TODO: this should do casefolding
- last_value = Melder_toLowerCase (last_value);
- }
- /* For case insensitive, something like [A-_] will
- generate an error here since ranges are converted to
- lower case. */
- if (second_value - 1 > last_value)
- REG_FAIL (U"invalid [] range")
- /* If only one character in range (e.g [a-a]) then this
- loop is not run since the first character of any range
- was emitted by the previous iteration of while loop. */
- for (; second_value <= last_value; second_value++)
- emit_class_byte (second_value);
- last_emit = (char32) last_value;
- Reg_Parse ++;
- } /* End class character range code. */
- } else if (*Reg_Parse == '\\') {
- Reg_Parse++;
- if ( (test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
- emit_class_byte (test);
- last_emit = test;
- } else if ( (test = literal_escape (*Reg_Parse)) != '\0') {
- emit_byte (test);
- last_emit = test;
- } else if (shortcut_escape (*Reg_Parse,
- NULL,
- CHECK_CLASS_ESCAPE)) {
- if (* (Reg_Parse + 1) == '-') {
- /* Specifically disallow shortcut escapes as the start
- of a character class range (see comment above.) */
- Melder_sprint (Error_Text,128, U"\\", *Reg_Parse, U" not allowed as range operand");
- REG_FAIL (Error_Text);
- } else {
- /* Emit the bytes that are part of the shortcut
- escape sequence's range (e.g. \d = 0123456789) */
- Melder_sprint (Error_Text,128, U"Cannot have escape sequences such as \\", *Reg_Parse, U" inside a class");
- }
- } else {
- Melder_sprint (Error_Text,128, U"\\", *Reg_Parse, U" is an invalid char class escape sequence");
- REG_FAIL (Error_Text);
- }
- Reg_Parse++;
- /* End of class escaped sequence code */
- } else {
- emit_class_byte (*Reg_Parse); /* Ordinary class character. */
- last_emit = *Reg_Parse;
- Reg_Parse++;
- }
- } /* End of while (*Reg_Parse != U'\0' && *Reg_Parse != U']') */
- if (*Reg_Parse != U']') {
- REG_FAIL (U"missing right \']\'");
- }
- emit_byte (U'\0');
- /* NOTE: it is impossible to specify an empty class. This is
- because [] would be interpreted as "begin character class"
- followed by a literal ']' character and no "end character class"
- delimiter (']'). Because of this, it is always safe to assume
- that a class HAS_WIDTH. */
- Reg_Parse ++;
- *flag_param |= HAS_WIDTH | SIMPLE;
- range_param->lower = 1;
- range_param->upper = 1;
- }
- break; /* End of character class code. */
- case U'\\':
- /* Force Error_Text to have a length of zero. This way we can tell if
- either of the calls to shortcut_escape() or back_ref() fill
- Error_Text with an error message. */
- Error_Text [0] = U'\0';
- if ( (ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
- Reg_Parse ++;
- range_param->lower = 1;
- range_param->upper = 1;
- break;
- } else if ( (ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
- /* Can't make any assumptions about a back-reference as to SIMPLE
- or HAS_WIDTH. For example (^|<) is neither simple nor has
- width. So we don't flip bits in flag_param here. */
- Reg_Parse++;
- /* Back-references always have an unknown length */
- range_param->lower = -1;
- range_param->upper = -1;
- break;
- }
- if (Error_Text [0] != U'\0')
- REG_FAIL (Error_Text)
- /* At this point it is apparent that the escaped character is not a
- shortcut escape or back-reference. Back up one character to allow
- the default code to include it as an ordinary character. */
- /* Fall through to Default case to handle literal escapes and numeric
- escapes. */
- default:
- Reg_Parse --; /* If we fell through from the above code, we are now
- pointing at the back slash (\) character. */
- {
- char32 *parse_save;
- int len = 0;
- if (Is_Case_Insensitive) {
- ret_val = emit_node (SIMILAR);
- } else {
- ret_val = emit_node (EXACTLY);
- }
- /* Loop until we find a meta character, shortcut escape, back
- reference, or end of regex string. */
- for (; *Reg_Parse != '\0' && ! str32chr (Meta_Char, *Reg_Parse); len++) {
- /* Save where we are in case we have to back
- this character out. */
- parse_save = Reg_Parse;
- if (*Reg_Parse == U'\\') {
- Reg_Parse ++; // point to escaped character
- Error_Text [0] = U'\0'; // see comment above
- if ( (test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
- if (Is_Case_Insensitive) {
- emit_byte (Melder_toLowerCase (test));
- } else {
- emit_byte (test);
- }
- } else if ( (test = literal_escape (*Reg_Parse))) {
- emit_byte (test);
- } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
- /* Leave back reference for next `atom' call */
- Reg_Parse --; break;
- } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
- /* Leave shortcut escape for next `atom' call */
- Reg_Parse --; break;
- } else {
- if (Error_Text [0] != U'\0') {
- /* None of the above calls generated an error message
- so generate our own here. */
- Melder_sprint (Error_Text,128, U"\\", *Reg_Parse, U" is an invalid escape sequence");
- }
- REG_FAIL (Error_Text)
- }
- Reg_Parse ++;
- } else {
- /* Ordinary character */
- if (Is_Case_Insensitive) {
- emit_byte (Melder_toLowerCase (*Reg_Parse));
- } else {
- emit_byte (*Reg_Parse);
- }
- Reg_Parse ++;
- }
- /* If next regex token is a quantifier (?, +. *, or {m,n}) and
- our EXACTLY node so far is more than one character, leave the
- last character to be made into an EXACTLY node one character
- wide for the multiplier to act on. For example 'abcd* would
- have an EXACTLY node with an 'abc' operand followed by a STAR
- node followed by another EXACTLY node with a 'd' operand. */
- if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
- Reg_Parse = parse_save; /* Point to previous regex token. */
- if (Code_Emit_Ptr == &Compute_Size) {
- Reg_Size--;
- } else {
- Code_Emit_Ptr--; /* Write over previously emitted byte. */
- }
- break;
- }
- }
- if (len <= 0)
- REG_FAIL (U"internal error #4, `atom\'")
- *flag_param |= HAS_WIDTH;
- if (len == 1)
- *flag_param |= SIMPLE;
- range_param->lower = len;
- range_param->upper = len;
- emit_byte (U'\0');
- }
- } /* END switch (*Reg_Parse++) */
- return (ret_val);
- }
- /*----------------------------------------------------------------------*
- * emit_node
- *
- * Emit (if appropriate) the op code for a regex node atom.
- *
- * The NEXT pointer is initialized to NULL.
- *
- * Returns a pointer to the START of the emitted node.
- *----------------------------------------------------------------------*/
- static char32 *emit_node (int op_code) {
- char32 *ret_val = Code_Emit_Ptr; /* Return address of start of node */
- if (ret_val == & Compute_Size) {
- Reg_Size += NODE_SIZE;
- } else {
- char32 *ptr = ret_val;
- *ptr ++ = op_code;
- *ptr ++ = U'\0'; // null "NEXT" pointer
- *ptr ++ = U'\0';
- Code_Emit_Ptr = ptr;
- }
- return ret_val;
- }
- /*----------------------------------------------------------------------*
- * emit_byte
- *
- * Emit (if appropriate) a byte of code (usually part of an operand.)
- *----------------------------------------------------------------------*/
- static void emit_byte (char32 c) {
- if (Code_Emit_Ptr == & Compute_Size) {
- Reg_Size ++;
- } else {
- *Code_Emit_Ptr ++ = c;
- }
- }
- /*----------------------------------------------------------------------*
- * emit_class_byte
- *
- * Emit (if appropriate) a byte of code (usually part of a character
- * class operand.)
- *----------------------------------------------------------------------*/
- static void emit_class_byte (char32 c) {
- if (Code_Emit_Ptr == & Compute_Size) {
- Reg_Size ++;
- if (Is_Case_Insensitive && Melder_isLetter (c)) {
- Reg_Size ++;
- }
- } else if (Is_Case_Insensitive && Melder_isLetter (c)) {
- /* For case-insensitive character classes, emit both upper and lower case
- versions of alphabetical characters. */
- *Code_Emit_Ptr ++ = Melder_toLowerCase (c);
- *Code_Emit_Ptr ++ = Melder_toUpperCase (c);
- } else {
- *Code_Emit_Ptr ++ = c;
- }
- }
- /*----------------------------------------------------------------------*
- * emit_special
- *
- * Emit nodes that need special processing.
- *----------------------------------------------------------------------*/
- static char32 *emit_special (
- char32 op_code,
- unsigned long test_val,
- int index)
- {
- char32 *ret_val = & Compute_Size;
- char32 *ptr;
- if (Code_Emit_Ptr == & Compute_Size) {
- switch (op_code) {
- case POS_BEHIND_OPEN:
- case NEG_BEHIND_OPEN:
- Reg_Size += LENGTH_SIZE; /* Length of the look-behind match */
- Reg_Size += NODE_SIZE; /* Make room for the node */
- break;
- case TEST_COUNT:
- Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
- // ppgb FALLTHROUGH (both test and inc require index)
- case INC_COUNT:
- Reg_Size += INDEX_SIZE; /* Make room for an index value. */
- // ppgb FALLTHROUGH (everything requires node)
- default:
- Reg_Size += NODE_SIZE; /* Make room for the node. */
- }
- } else {
- ret_val = emit_node (op_code); /* Return the address for start of node. */
- ptr = Code_Emit_Ptr;
- if (op_code == INC_COUNT || op_code == TEST_COUNT) {
- *ptr ++ = (char32) index;
- if (op_code == TEST_COUNT) {
- *ptr ++ = PUT_OFFSET_L (test_val);
- *ptr ++ = PUT_OFFSET_R (test_val);
- }
- } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
- *ptr ++ = PUT_OFFSET_L (test_val);
- *ptr ++ = PUT_OFFSET_R (test_val);
- *ptr ++ = PUT_OFFSET_L (test_val);
- *ptr ++ = PUT_OFFSET_R (test_val);
- }
- Code_Emit_Ptr = ptr;
- }
- return ret_val;
- }
- /*----------------------------------------------------------------------*
- * insert
- *
- * Insert a node in front of already emitted node(s). Means relocating
- * the operand. Code_Emit_Ptr points one byte past the just emitted
- * node and operand. The parameter `insert_pos' points to the location
- * where the new node is to be inserted.
- *----------------------------------------------------------------------*/
- static char32 *insert (
- char32 op,
- char32 *insert_pos,
- long min,
- long max,
- int index) {
- char32 *src;
- char32 *dst;
- char32 *place;
- int insert_size = NODE_SIZE;
- if (op == BRACE || op == LAZY_BRACE) {
- /* Make room for the min and max values. */
- insert_size += (2 * NEXT_PTR_SIZE);
- } else if (op == INIT_COUNT) {
- /* Make room for an index value . */
- insert_size += INDEX_SIZE;
- }
- if (Code_Emit_Ptr == &Compute_Size) {
- Reg_Size += insert_size;
- return &Compute_Size;
- }
- src = Code_Emit_Ptr;
- Code_Emit_Ptr += insert_size;
- dst = Code_Emit_Ptr;
- /* Relocate the existing emitted code to make room for the new node. */
- while (src > insert_pos) {
- *--dst = *--src;
- }
- place = insert_pos; /* Where operand used to be. */
- *place++ = op; /* Inserted operand. */
- *place++ = '\0'; /* NEXT pointer for inserted operand. */
- *place++ = '\0';
- if (op == BRACE || op == LAZY_BRACE) {
- *place++ = PUT_OFFSET_L (min);
- *place++ = PUT_OFFSET_R (min);
- *place++ = PUT_OFFSET_L (max);
- *place++ = PUT_OFFSET_R (max);
- } else if (op == INIT_COUNT) {
- *place++ = (char32) index;
- }
- return place; /* Return a pointer to the start of the code moved. */
- }
- /*----------------------------------------------------------------------*
- * tail - Set the next-pointer at the end of a node chain.
- *----------------------------------------------------------------------*/
- static void tail (char32 *search_from, char32 *point_to) {
- char32 *scan;
- char32 *next;
- int offset;
- if (search_from == &Compute_Size) {
- return;
- }
- /* Find the last node in the chain (node with a null NEXT pointer) */
- scan = search_from;
- for (;;) {
- next = next_ptr (scan);
- if (!next) {
- break;
- }
- scan = next;
- }
- if (GET_OP_CODE (scan) == BACK) {
- offset = scan - point_to;
- } else {
- offset = point_to - scan;
- }
- /* Set NEXT pointer */
- * (scan + 1) = PUT_OFFSET_L (offset);
- * (scan + 2) = PUT_OFFSET_R (offset);
- }
- /*--------------------------------------------------------------------*
- * offset_tail
- *
- * Perform a tail operation on (ptr + offset).
- *--------------------------------------------------------------------*/
- static void offset_tail (char32 *ptr, int offset, char32 *val) {
- if (ptr == & Compute_Size || ! ptr)
- return;
- tail (ptr + offset, val);
- }
- /*--------------------------------------------------------------------*
- * branch_tail
- *
- * Perform a tail operation on (ptr + offset) but only if `ptr' is a
- * BRANCH node.
- *--------------------------------------------------------------------*/
- static void branch_tail (char32 *ptr, int offset, char32 *val) {
- if (ptr == & Compute_Size || ! ptr || GET_OP_CODE (ptr) != BRANCH)
- return;
- tail (ptr + offset, val);
- }
- /*--------------------------------------------------------------------*
- * shortcut_escape
- *
- * Implements convenient escape sequences that represent entire
- * character classes or special location assertions (similar to escapes
- * supported by Perl)
- * _
- * \d Digits [0-9] |
- * \D NOT a digit [^0-9] | (Examples
- * \l Letters [a-zA-Z] | at left
- * \L NOT a Letter [^a-zA-Z] | are
- * \s Whitespace [ \t\n\r\f\v] | for
- * \S NOT Whitespace [^ \t\n\r\f\v] | ASCII)
- * \w "Word" character [a-zA-Z0-9_] |
- * \W NOT a "Word" character [^a-zA-Z0-9_] _|
- *
- * \B Matches any character that is NOT a word-delimiter
- *
- * Codes for the "emit" parameter:
- *
- * EMIT_NODE
- * Emit a shortcut node. Shortcut nodes have an implied set of
- * class characters. This helps keep the compiled regex string
- * small.
- *
- * CHECK_ESCAPE
- * Only verify that this is a valid shortcut escape.
- *
- * CHECK_CLASS_ESCAPE
- * Same as CHECK_ESCAPE but only allows characters valid within
- * a klas.
- *
- *--------------------------------------------------------------------*/
- static char32 *shortcut_escape (
- char32 c,
- int *flag_param,
- int emit) {
- char32 *klas = NULL;
- static const conststring32 codes = U"ByYwWdDlLsS";
- char32 *ret_val = (char32 *) 1; // assume success
- conststring32 valid_codes;
- if (emit == CHECK_CLASS_ESCAPE) {
- valid_codes = codes + 5; /* \B, \y, \Y, \w and \W are not allowed in classes */
- } else {
- valid_codes = codes;
- }
- if (! str32chr (valid_codes, c)) {
- return nullptr; // not a valid shortcut escape sequence
- } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
- return ret_val; // just checking if this is a valid shortcut escape
- }
- switch (c) {
- case U'd':
- case U'D':
- if (emit == EMIT_NODE)
- ret_val = (iswlower ((int) c) ? emit_node (DIGIT) : emit_node (NOT_DIGIT));
- break;
- case U'l':
- case U'L':
- if (emit == EMIT_NODE)
- ret_val = (iswlower ((int) c) ? emit_node (LETTER) : emit_node (NOT_LETTER));
- break;
- case U's':
- case U'S':
- if (emit == EMIT_NODE) {
- if (Match_Newline) {
- ret_val = (iswlower ((int) c) ? emit_node (SPACE_NL) : emit_node (NOT_SPACE_NL));
- } else {
- ret_val = (iswlower ((int) c) ? emit_node (SPACE) : emit_node (NOT_SPACE));
- }
- }
- break;
- case U'w':
- case U'W':
- if (emit == EMIT_NODE) {
- ret_val = (iswlower ((int) c) ? emit_node (WORD_CHAR) : emit_node (NOT_WORD_CHAR));
- } else {
- REG_FAIL (U"internal error #105 `shortcut_escape\'");
- }
- break;
- /*
- Since the delimiter table is not available at regex compile time,
- \B, \y and \Y can only generate a node. At run time, the delimiter table
- will be available for these nodes to use.
- */
- case 'y':
- if (emit == EMIT_NODE) {
- ret_val = emit_node (IS_DELIM);
- } else {
- REG_FAIL (U"internal error #5 `shortcut_escape\'");
- }
- break;
- case 'Y':
- if (emit == EMIT_NODE) {
- ret_val = emit_node (NOT_DELIM);
- } else {
- REG_FAIL (U"internal error #6 `shortcut_escape\'");
- }
- break;
- case 'B':
- if (emit == EMIT_NODE) {
- ret_val = emit_node (NOT_BOUNDARY);
- } else {
- REG_FAIL (U"internal error #7 `shortcut_escape\'");
- }
- break;
- default:
- /*
- We get here if there isn't a case for every character in the string "codes".
- */
- REG_FAIL (U"internal error #8 `shortcut_escape\'");
- }
- if (emit == EMIT_NODE && c != 'B') {
- *flag_param |= (HAS_WIDTH | SIMPLE);
- }
- if (klas) {
- /*
- Emit bytes within a character class operand.
- */
- while (*klas != U'\0')
- emit_byte (*klas ++);
- }
- return ret_val;
- }
- /*--------------------------------------------------------------------*
- * numeric_escape
- *
- * Implements hex and octal numeric escape sequence syntax.
- *
- * Hexadecimal Escape: \x## Max of two digits Must have leading 'x'.
- * Octal Escape: \0### Max of three digits and not greater
- * than 377 octal. Must have leading zero.
- *
- * Returns the actual character value or NULL if not a valid hex or
- * octal escape. REG_FAIL is called if \x0, \x00, \0, \00, \000, or
- * \0000 is specified.
- *--------------------------------------------------------------------*/
- static char32 numeric_escape (
- char32 c,
- char32 **parse) {
- static char32 digits [] = { 'f', 'e', 'd', 'c', 'b', 'a', 'F', 'E', 'D', 'C', 'B', 'A', '9', '8', '7', '6', '5', '4', '3', '2', '1', '0', '\0' };
- static unsigned int digit_val [] = {
- 15, 14, 13, 12, 11, 10, /* Lower case Hex digits */
- 15, 14, 13, 12, 11, 10, /* Upper case Hex digits */
- 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
- }; /* Decimal Digits */
- char32 *scan;
- char32 *pos_ptr;
- char32 *digit_str;
- unsigned int value = 0;
- unsigned int radix = 8;
- int width = 3; /* Can not be bigger than \0377 */
- int pos_delta = 14;
- int i, pos;
- switch (c) {
- case U'0':
- digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
- break;
- case U'x':
- case U'X':
- width = 2; /* Can not be bigger than \0377 */
- radix = 16;
- pos_delta = 0;
- digit_str = digits; /* Use all of the digit characters. */
- break;
- default:
- return U'\0'; /* Not a numeric escape */
- }
- scan = *parse; scan ++; /* Only change *parse on success. */
- pos_ptr = str32chr (digit_str, *scan);
- for (i = 0; pos_ptr && i < width; i++) {
- pos = (pos_ptr - digit_str) + pos_delta;
- value = (value * radix) + digit_val [pos];
- /* If this digit makes the value over 255, treat this digit as a literal
- character instead of part of the numeric escape. For example, \0777
- will be processed as \077 (an 'M') and a literal '7' character, NOT
- 511 decimal which is > 255. */
- if (value > 255) {
- /* Back out calculations for last digit processed. */
- value -= digit_val [pos];
- value /= radix;
- break; /* Note that scan will not be incremented and still points to
- the digit that caused overflow. It will be decremented by
- the "else" below to point to the last character that is
- considered to be part of the octal escape. */
- }
- scan ++;
- pos_ptr = str32chr (digit_str, *scan);
- }
- /* Handle the case of "\0" i.e. trying to specify a NULL character. */
- if (value == 0) {
- if (c == U'0') {
- Melder_sprint (Error_Text,128, U"\\00 is an invalid octal escape");
- } else {
- Melder_sprint (Error_Text,128, U"\\", c, U"0 is an invalid hexadecimal escape");
- }
- } else {
- /* Point to the last character of the number on success. */
- scan --;
- *parse = scan;
- }
- return value;
- }
- /*--------------------------------------------------------------------*
- * literal_escape
- *
- * Recognize escaped literal characters (prefixed with backslash),
- * and translate them into the corresponding character.
- *
- * Returns the proper character value or NULL if not a valid literal
- * escape.
- *--------------------------------------------------------------------*/
- static char32 literal_escape (char32 c) {
- static char32 valid_escape [] = {
- 'a', 'b',
- 'e',
- 'f', 'n', 'r', 't', 'v', '(', ')', '-', '[', ']',
- '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
- '+', '?', '&', '\0'
- };
- static char32 value [] = {
- '\a', '\b',
- 0x1B, /* Escape character in Unicode character set. */
- '\f', '\n', '\r', '\t', '\v', '(', ')', '-', '[', ']',
- '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
- '+', '?', '&', '\0'
- };
- for (int i = 0; valid_escape [i] != '\0'; i++) {
- if (c == valid_escape [i]) {
- return value [i];
- }
- }
- return '\0';
- }
- /*--------------------------------------------------------------------*
- * back_ref
- *
- * Process a request to match a previous parenthesized thing.
- * Parenthetical entities are numbered beginning at 1 by counting
- * opening parentheses from left to to right. \0 would represent
- * whole match, but would confuse numeric_escape as an octal escape,
- * so it is forbidden.
- *
- * Constructs of the form \~1, \~2, etc. are cross-regex back
- * references and are used in syntax highlighting patterns to match
- * text previously matched by another regex. *** IMPLEMENT LATER ***
- *--------------------------------------------------------------------*/
- static char32 *back_ref (
- char32 *c,
- int *flag_param,
- int emit) {
- int paren_no, c_offset = 0, is_cross_regex = 0;
- char32 *ret_val;
- /* Implement cross regex backreferences later. */
- /* if (*c == (regularExp_CHAR) ('~')) {
- c_offset++;
- is_cross_regex++;
- } */
- paren_no = (int) (* (c + c_offset) - (char32) ('0'));
- if (! Melder_isAsciiDecimalNumber (* (c + c_offset)) || /* Only \1, \2, ... \9 are supported. */
- paren_no == 0) { /* Should be caught by numeric_escape. */
- return NULL;
- }
- /* Make sure parentheses for requested back-reference are complete. */
- if (! is_cross_regex && ! TEST_BIT (Closed_Parens, paren_no)) {
- Melder_sprint (Error_Text,128, U"\\", paren_no, U" is an illegal back reference");
- return NULL;
- }
- if (emit == EMIT_NODE) {
- if (is_cross_regex) {
- Reg_Parse ++; /* Skip past the '~' in a cross regex back reference.
- We only do this if we are emitting code. */
- if (Is_Case_Insensitive) {
- ret_val = emit_node (X_REGEX_BR_CI);
- } else {
- ret_val = emit_node (X_REGEX_BR);
- }
- } else {
- if (Is_Case_Insensitive) {
- ret_val = emit_node (BACK_REF_CI);
- } else {
- ret_val = emit_node (BACK_REF);
- }
- }
- emit_byte ((char32) paren_no);
- if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
- *flag_param |= HAS_WIDTH;
- }
- } else if (emit == CHECK_ESCAPE) {
- ret_val = (char32 *) 1;
- } else {
- ret_val = nullptr;
- }
- return ret_val;
- }
- /*======================================================================*
- * Regex execution related code
- *======================================================================*/
- /* Global work variables for `ExecRE'. */
- static char32 *Reg_Input; /* String-input pointer. */
- static const char32 *Start_Of_String; /* Beginning of input, for ^ */
- /* and < checks. */
- static const char32 *End_Of_String; /* Logical end of input (if supplied, till \0 otherwise) */
- static const char32 *Look_Behind_To; /* Position till were look behind can safely check back */
- static char32 **Start_Ptr_Ptr; /* Pointer to `startp' array. */
- static char32 **End_Ptr_Ptr; /* Ditto for `endp'. */
- static char32 *Extent_Ptr_FW; /* Forward extent pointer */
- static char32 *Extent_Ptr_BW; /* Backward extent pointer */
- static char32 *Back_Ref_Start [10]; /* Back_Ref_Start [0] and */
- static char32 *Back_Ref_End [10]; /* Back_Ref_End [0] are not */
- /* used. This simplifies */
- /* indexing. */
- /*
- * Measured recursion limits:
- * Linux: +/- 40 000 (up to 110 000)
- * Solaris: +/- 85 000
- * HP-UX 11: +/- 325 000
- *
- * So 10 000 ought to be safe.
- */
- #define REGEX_RECURSION_LIMIT 10000
- static int Recursion_Count; /* Recursion counter */
- static int Recursion_Limit_Exceeded; /* Recursion limit exceeded flag */
- #define AT_END_OF_STRING(X) (*(X) == U'\0' || (End_Of_String != NULL && (X) >= End_Of_String))
- /* static regexp *Cross_Regex_Backref; */
- static bool Prev_Is_BOL;
- static bool Succ_Is_EOL;
- static bool Prev_Is_Delim;
- static bool Succ_Is_Delim;
- /* Define a pointer to an array to hold general (...){m,n} counts. */
- typedef struct brace_counts {
- unsigned long count [1]; /* More unwarranted chumminess with compiler. */
- } brace_counts;
- static struct brace_counts *Brace;
- /* Forward declarations of functions used by `ExecRE' */
- static int attempt (regexp *, char32 *);
- static int match (char32 *, int *);
- static unsigned long greedy (char32 *, long);
- static void adjustcase (char32 *, int, char32);
- /*
- * ExecRE - match a `regexp' structure against a string
- *
- * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
- * it. If reverse is true, `end' should be specified, and searching begins at
- * `end'. "isbol" should be set to true if the beginning of the string is the
- * actual beginning of a line (since `ExecRE' can't look backwards from the
- * beginning to find whether there was a newline before). Likewise, "isbow"
- * asks whether the string is preceded by a word delimiter. End of string is
- * always treated as a word and line boundary (there may be cases where it
- * shouldn't be, in which case, this should be changed).
- * Look_behind_to indicates the position till where it is safe to
- * perform look-behind matches. If set, it should be smaller than or equal
- * to the start position of the search (pointed at by string). If it is NULL,
- * it defaults to the start position.
- * Finally, match_to indicates the logical end of the string, till where
- * matches are allowed to extend. Note that look-ahead patterns may look
- * past that boundary. If match_to is set to NULL, the terminating \0 is
- * assumed to correspond to the logical boundary. Match_to, if set, should be
- * larger than or equal to end, if set.
- */
- int ExecRE (
- regexp *prog,
- regexp *cross_regex_backref,
- conststring32 string,
- const char32 *end,
- int reverse,
- char32 prev_char,
- char32 succ_char,
- conststring32 look_behind_to,
- conststring32 match_to) {
- char32 *str;
- char32 **s_ptr;
- char32 **e_ptr;
- int ret_val = 0;
- int i;
- (void) cross_regex_backref;
- s_ptr = (char32 **) prog->startp;
- e_ptr = (char32 **) prog->endp;
- /* Check for valid parameters. */
- if (! prog || ! string) {
- reg_error (U"NULL parameter to `ExecRE\'");
- goto SINGLE_RETURN;
- }
- /* Check validity of program. */
- if (U_CHAR_AT (prog->program) != MAGIC) {
- reg_error (U"corrupted program");
- goto SINGLE_RETURN;
- }
- /* Remember the logical end of the string. */
- End_Of_String = match_to;
- if (reverse && ! end) {
- for (end = string; ! AT_END_OF_STRING (end); end ++) { }
- succ_char = U'\n';
- } else if (! end) {
- succ_char = U'\n';
- }
- /* Remember the beginning of the string for matching BOL */
- Start_Of_String = string;
- Look_Behind_To = look_behind_to ? look_behind_to : string;
- Prev_Is_BOL = ( prev_char == U'\n' || prev_char == U'\0' );
- Succ_Is_EOL = ( succ_char == U'\n' || succ_char == U'\0' );
- Prev_Is_Delim = ! Melder_isWordCharacter (prev_char);
- Succ_Is_Delim = ! Melder_isWordCharacter (succ_char);
- Total_Paren = (int) (prog->program [1]);
- Num_Braces = (int) (prog->program [2]);
- /* Reset the recursion detection flag */
- Recursion_Limit_Exceeded = 0;
- /* Cross_Regex_Backref = cross_regex_backref; */
- /* Allocate memory for {m,n} construct counting variables if need be. */
- if (Num_Braces > 0) {
- Brace =
- (brace_counts *) malloc (sizeof (brace_counts) * (size_t) Num_Braces);
- if (Brace == NULL) {
- reg_error (U"out of memory in `ExecRE\'");
- goto SINGLE_RETURN;
- }
- } else {
- Brace = NULL;
- }
- /* Initialize the first nine (9) capturing parentheses start and end
- pointers to point to the start of the search string. This is to prevent
- crashes when later trying to reference captured parens that do not exist
- in the compiled regex. We only need to do the first nine since users
- can only specify \1, \2, ... \9. */
- for (i = 9; i > 0; i--) {
- *s_ptr++ = (char32 *) string;
- *e_ptr++ = (char32 *) string;
- }
- if (!reverse) { /* Forward Search */
- if (prog->anchor) {
- /* Search is anchored at BOL */
- if (attempt (prog, (char32 *) string)) {
- ret_val = 1;
- goto SINGLE_RETURN;
- }
- for (str = (char32 *) string;
- !AT_END_OF_STRING (str) && str != (char32 *) end && !Recursion_Limit_Exceeded;
- str++) {
- if (*str == '\n') {
- if (attempt (prog, str + 1)) {
- ret_val = 1;
- break;
- }
- }
- }
- goto SINGLE_RETURN;
- } else if (prog->match_start != '\0') {
- /* We know what char match must start with. */
- for (str = (char32 *) string;
- !AT_END_OF_STRING (str) && str != (char32 *) end && !Recursion_Limit_Exceeded;
- str++) {
- if (*str == (char32) prog->match_start) {
- if (attempt (prog, str)) {
- ret_val = 1;
- break;
- }
- }
- }
- goto SINGLE_RETURN;
- } else {
- /* General case */
- for (str = (char32 *) string;
- !AT_END_OF_STRING (str) && str != (char32 *) end && !Recursion_Limit_Exceeded;
- str++) {
- if (attempt (prog, str)) {
- ret_val = 1;
- break;
- }
- }
- /* Beware of a single $ matching \0 */
- if (!Recursion_Limit_Exceeded && !ret_val && AT_END_OF_STRING (str) && str != (char32 *) end) {
- if (attempt (prog, str)) {
- ret_val = 1;
- }
- }
- goto SINGLE_RETURN;
- }
- } else { /* Search reverse, same as forward, but loops run backward */
- /* Make sure that we don't start matching beyond the logical end */
- if (End_Of_String != NULL && (char32 *) end > End_Of_String) {
- end = (const char32 *) End_Of_String;
- }
- if (prog->anchor) {
- /* Search is anchored at BOL */
- for (str = (char32 *) (end - 1); str >= (char32 *) string && !Recursion_Limit_Exceeded; str--) {
- if (*str == '\n') {
- if (attempt (prog, str + 1)) {
- ret_val = 1;
- goto SINGLE_RETURN;
- }
- }
- }
- if (!Recursion_Limit_Exceeded && attempt (prog, (char32 *) string)) {
- ret_val = 1;
- goto SINGLE_RETURN;
- }
- goto SINGLE_RETURN;
- } else if (prog->match_start != '\0') {
- /* We know what char match must start with. */
- for (str = (char32 *) end; str >= (char32 *) string && !Recursion_Limit_Exceeded; str--) {
- if (*str == (char32) prog->match_start) {
- if (attempt (prog, str)) {
- ret_val = 1;
- break;
- }
- }
- }
- goto SINGLE_RETURN;
- } else {
- /* General case */
- for (str = (char32 *) end; str >= (char32 *) string && !Recursion_Limit_Exceeded; str--) {
- if (attempt (prog, str)) {
- ret_val = 1;
- break;
- }
- }
- }
- }
- SINGLE_RETURN: if (Brace) {
- free (Brace);
- }
- if (Recursion_Limit_Exceeded) {
- return (0);
- }
- return (ret_val);
- }
- static int attempt (regexp *prog, char32 *string) {
- int i;
- char32 **s_ptr;
- char32 **e_ptr;
- int branch_index = 0; /* Must be set to zero ! */
- Reg_Input = string;
- Start_Ptr_Ptr = (char32 **) prog->startp;
- End_Ptr_Ptr = (char32 **) prog->endp;
- s_ptr = (char32 **) prog->startp;
- e_ptr = (char32 **) prog->endp;
- /* Reset the recursion counter. */
- Recursion_Count = 0;
- /* Overhead due to capturing parentheses. */
- Extent_Ptr_BW = string;
- Extent_Ptr_FW = nullptr;
- for (i = Total_Paren + 1; i > 0; i--) {
- *s_ptr ++ = nullptr;
- *e_ptr ++ = nullptr;
- }
- if (match ( (char32 *) (prog->program + REGEX_START_OFFSET),
- &branch_index)) {
- prog->startp [0] = (char32 *) string;
- prog->endp [0] = (char32 *) Reg_Input; /* <-- One char AFTER */
- prog->extentpBW = (char32 *) Extent_Ptr_BW; /* matched string! */
- prog->extentpFW = (char32 *) Extent_Ptr_FW;
- prog->top_branch = branch_index;
- return 1;
- } else {
- return 0;
- }
- }
- /*----------------------------------------------------------------------*
- * match - main matching routine
- *
- * Conceptually the strategy is simple: check to see whether the
- * current node matches, call self recursively to see whether the rest
- * matches, and then act accordingly. In practice we make some effort
- * to avoid recursion, in particular by going through "ordinary" nodes
- * (that don't need to know whether the rest of the match failed) by a
- * loop instead of by recursion. Returns 0 failure, 1 success.
- *----------------------------------------------------------------------*/
- #define MATCH_RETURN(X)\
- { --Recursion_Count; return (X); }
- #define CHECK_RECURSION_LIMIT\
- if (Recursion_Limit_Exceeded) MATCH_RETURN (0);
- static int match (char32 *prog, int *branch_index_param) {
- char32 *scan; /* Current node. */
- char32 *next; /* Next node. */
- int next_ptr_offset; /* Used by the NEXT_PTR () macro */
- if (++ Recursion_Count > REGEX_RECURSION_LIMIT) {
- if (! Recursion_Limit_Exceeded) { /* Prevent duplicate errors */
- reg_error (U"recursion limit exceeded, please respecify expression");
- }
- Recursion_Limit_Exceeded = 1;
- MATCH_RETURN (0)
- }
- scan = prog;
- while (scan) {
- NEXT_PTR (scan, next);
- switch (GET_OP_CODE (scan)) {
- case BRANCH: {
- char32 *save;
- int branch_index_local = 0;
- if (GET_OP_CODE (next) != BRANCH) { /* No choice. */
- next = OPERAND (scan); /* Avoid recursion. */
- } else {
- do {
- save = Reg_Input;
- if (match (OPERAND (scan), NULL)) {
- if (branch_index_param)
- *branch_index_param = branch_index_local;
- MATCH_RETURN (1)
- }
- CHECK_RECURSION_LIMIT
- ++branch_index_local;
- Reg_Input = save; /* Backtrack. */
- NEXT_PTR (scan, scan);
- } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
- MATCH_RETURN (0) // NOT REACHED
- }
- }
- break;
- case EXACTLY: {
- char32 *opnd = OPERAND (scan);
- /* Inline the first character, for speed. */
- if (*opnd != *Reg_Input)
- MATCH_RETURN (0)
- int len = str32len (opnd);
- if (End_Of_String != NULL && Reg_Input + len > End_Of_String)
- MATCH_RETURN (0)
- if (len > 1 && str32ncmp (opnd, Reg_Input, len) != 0)
- MATCH_RETURN (0)
- Reg_Input += len;
- }
- break;
- case SIMILAR: {
- char32 *opnd = OPERAND (scan);
- char32 test;
- /* Note: the SIMILAR operand was converted to lower case during
- regex compile. */
- while ((test = *opnd++) != U'\0') {
- if (AT_END_OF_STRING (Reg_Input) || Melder_toLowerCase (*Reg_Input++) != test)
- MATCH_RETURN (0)
- }
- }
- break;
- case BOL: /* `^' (beginning of line anchor) */
- if (Reg_Input == Start_Of_String) {
- if (Prev_Is_BOL)
- break;
- } else if (* (Reg_Input - 1) == U'\n') {
- break;
- }
- MATCH_RETURN (0)
- case EOL: /* `$' anchor matches end of line and end of string */
- if (*Reg_Input == U'\n' || (AT_END_OF_STRING (Reg_Input) && Succ_Is_EOL))
- break;
- MATCH_RETURN (0)
- case BOWORD: /* `<' (beginning of word anchor) */
- /* Check to see if the current character is not a delimiter
- and the preceding character is. */
- {
- bool prev_is_delim;
- if (Reg_Input == Start_Of_String) {
- prev_is_delim = Prev_Is_Delim;
- } else {
- prev_is_delim = ! Melder_isWordCharacter (*(Reg_Input - 1));
- }
- if (prev_is_delim) {
- bool current_is_delim;
- if (AT_END_OF_STRING (Reg_Input)) {
- current_is_delim = Succ_Is_Delim;
- } else {
- current_is_delim = ! Melder_isWordCharacter (*Reg_Input);
- }
- if (! current_is_delim)
- break;
- }
- }
- MATCH_RETURN (0)
- case EOWORD: /* `>' (end of word anchor) */
- /* Check to see if the current character is a delimiter
- and the preceding character is not. */
- {
- bool prev_is_delim;
- if (Reg_Input == Start_Of_String) {
- prev_is_delim = Prev_Is_Delim;
- } else {
- prev_is_delim = ! Melder_isWordCharacter (*(Reg_Input - 1));
- }
- if (! prev_is_delim) {
- bool current_is_delim;
- if (AT_END_OF_STRING (Reg_Input)) {
- current_is_delim = Succ_Is_Delim;
- } else {
- current_is_delim = ! Melder_isWordCharacter (*Reg_Input);
- }
- if (current_is_delim)
- break;
- }
- }
- MATCH_RETURN (0)
- case NOT_BOUNDARY: { /* \B (NOT a word boundary) */
- bool prev_is_delim;
- bool current_is_delim;
- if (Reg_Input == Start_Of_String) {
- prev_is_delim = Prev_Is_Delim;
- } else {
- prev_is_delim = ! Melder_isWordCharacter (*(Reg_Input - 1));
- }
- if (AT_END_OF_STRING (Reg_Input)) {
- current_is_delim = Succ_Is_Delim;
- } else {
- current_is_delim = ! Melder_isWordCharacter (*Reg_Input);
- }
- if (! (prev_is_delim ^ current_is_delim))
- break;
- }
- MATCH_RETURN (0)
- case IS_DELIM: /* \y (A word delimiter character.) */
- if (! Melder_isWordCharacter (*Reg_Input) && ! AT_END_OF_STRING (Reg_Input)) {
- Reg_Input++; break;
- }
- MATCH_RETURN (0)
- case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
- if (Melder_isWordCharacter (*Reg_Input) && ! AT_END_OF_STRING (Reg_Input)) {
- Reg_Input ++;
- break;
- }
- MATCH_RETURN (0)
- case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
- if (Melder_isWordCharacter (*Reg_Input) && ! AT_END_OF_STRING (Reg_Input)) {
- Reg_Input ++;
- break;
- }
- MATCH_RETURN (0)
- case NOT_WORD_CHAR:/* \W (NOT a word character) */
- if (Melder_isWordCharacter (*Reg_Input) || *Reg_Input == U'\n' || AT_END_OF_STRING (Reg_Input))
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case ANY: /* `.' (matches any character EXCEPT newline) */
- if (AT_END_OF_STRING (Reg_Input) || *Reg_Input == U'\n')
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case EVERY: /* `.' (matches any character INCLUDING newline) */
- if (AT_END_OF_STRING (Reg_Input))
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case DIGIT: /* \d; for ASCII, use [0-9] */
- if (! Melder_isDecimalNumber (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case NOT_DIGIT: /* \D; for ASCII, use [^0-9] */
- if (Melder_isDecimalNumber (*Reg_Input) || *Reg_Input == U'\n' || AT_END_OF_STRING (Reg_Input))
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case LETTER: /* \l; for ASCII, use [a-zA-Z] */
- if (! Melder_isLetter (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case NOT_LETTER: /* \L; for ASCII, use [^a-zA-Z] */
- if (Melder_isLetter (*Reg_Input) || *Reg_Input == '\n' || AT_END_OF_STRING (Reg_Input))
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case SPACE: /* \s; for ASCII, use [ \t] */
- if (! Melder_isHorizontalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case SPACE_NL: /* \s; for ASCII, use [\n \t\r\f\v] */
- if (! Melder_isHorizontalOrVerticalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case NOT_SPACE: /* \S; for ASCII, use [^\n \t\r\f\v] */
- if (Melder_isHorizontalOrVerticalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case NOT_SPACE_NL: /* \S; for ASCII, use [^ \t\r\f\v] */
- if (Melder_isHorizontalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case ANY_OF: /* [...] character class. */
- if (AT_END_OF_STRING (Reg_Input))
- MATCH_RETURN (0)
- /* Needed because strchr ()
- considers \0 as a member
- of the character set. */
- if (! str32chr (OPERAND (scan), *Reg_Input))
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case ANY_BUT: /* [^...] Negated character class-- does NOT normally
- match newline (\n added usually to operand at compile
- time.) */
- if (AT_END_OF_STRING (Reg_Input)) {
- MATCH_RETURN (0); /* See comment for ANY_OF. */
- }
- if (str32chr (OPERAND (scan), *Reg_Input) != nullptr)
- MATCH_RETURN (0)
- Reg_Input ++;
- break;
- case NOTHING:
- case BACK:
- break;
- case STAR:
- case PLUS:
- case QUESTION:
- case BRACE:
- case LAZY_STAR:
- case LAZY_PLUS:
- case LAZY_QUESTION:
- case LAZY_BRACE: {
- unsigned long num_matched = REG_ZERO;
- unsigned long min = ULONG_MAX, max = REG_ZERO;
- char32 *save;
- char32 next_char;
- char32 *next_op;
- int lazy = 0;
- /* Lookahead (when possible) to avoid useless match attempts
- when we know what character comes next. */
- if (GET_OP_CODE (next) == EXACTLY) {
- next_char = *OPERAND (next);
- } else {
- next_char = U'\0';/* i.e. Don't know what next character is. */
- }
- next_op = OPERAND (scan);
- switch (GET_OP_CODE (scan)) {
- case LAZY_STAR:
- lazy = 1;
- case STAR:
- min = REG_ZERO;
- max = ULONG_MAX;
- break;
- case LAZY_PLUS:
- lazy = 1;
- case PLUS:
- min = REG_ONE;
- max = ULONG_MAX;
- break;
- case LAZY_QUESTION:
- lazy = 1;
- case QUESTION:
- min = REG_ZERO;
- max = REG_ONE;
- break;
- case LAZY_BRACE:
- lazy = 1;
- case BRACE:
- min = (unsigned long)
- GET_OFFSET (scan + NEXT_PTR_SIZE);
- max = (unsigned long)
- GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
- if (max <= REG_INFINITY) {
- max = ULONG_MAX;
- }
- next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
- }
- save = Reg_Input;
- if (lazy) {
- if (min > REG_ZERO) {
- num_matched = greedy (next_op, min);
- }
- } else {
- num_matched = greedy (next_op, max);
- }
- while (min <= num_matched && num_matched <= max) {
- if (next_char == U'\0' || next_char == *Reg_Input) {
- if (match (next, NULL)) {
- MATCH_RETURN (1);
- }
- CHECK_RECURSION_LIMIT
- }
- /* Couldn't or didn't match. */
- if (lazy) {
- if (!greedy (next_op, 1)) {
- MATCH_RETURN (0);
- }
- num_matched ++; /* Inch forward. */
- } else if (num_matched > REG_ZERO) {
- num_matched --; /* Back up. */
- } else if (min == REG_ZERO && num_matched == REG_ZERO) {
- break;
- }
- Reg_Input = save + num_matched;
- }
- MATCH_RETURN (0);
- }
- break;
- case END:
- if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
- Extent_Ptr_FW = Reg_Input;
- }
- MATCH_RETURN (1) // Success!
- break;
- case INIT_COUNT:
- Brace->count [*OPERAND (scan)] = REG_ZERO;
- break;
- case INC_COUNT:
- Brace->count [*OPERAND (scan)] ++;
- break;
- case TEST_COUNT:
- if (Brace->count [*OPERAND (scan)] <
- (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
- next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
- }
- break;
- case BACK_REF:
- case BACK_REF_CI:
- /* case X_REGEX_BR: */
- /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
- {
- char32 *captured, *finish;
- int paren_no;
- paren_no = (int) * OPERAND (scan);
- /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
- GET_OP_CODE (scan) == X_REGEX_BR_CI) {
- if (Cross_Regex_Backref == NULL) MATCH_RETURN (0);
- captured =
- (regularExp_CHAR *) Cross_Regex_Backref->startp [paren_no];
- finish =
- (regularExp_CHAR *) Cross_Regex_Backref->endp [paren_no];
- } else { */
- captured = Back_Ref_Start [paren_no];
- finish = Back_Ref_End [paren_no];
- /* } */
- if (captured && finish) {
- if (captured > finish) {
- MATCH_RETURN (0);
- }
- if (GET_OP_CODE (scan) == BACK_REF_CI /* ||
- GET_OP_CODE (scan) == X_REGEX_BR_CI*/) {
- while (captured < finish) {
- if (AT_END_OF_STRING (Reg_Input) || Melder_toLowerCase (*captured++) != Melder_toLowerCase (*Reg_Input++)) // TODO: should casefold
- MATCH_RETURN (0)
- }
- } else {
- while (captured < finish) {
- if (AT_END_OF_STRING (Reg_Input) || *captured++ != *Reg_Input++)
- MATCH_RETURN (0)
- }
- }
- break;
- } else {
- MATCH_RETURN (0)
- }
- }
- case POS_AHEAD_OPEN:
- case NEG_AHEAD_OPEN:
- {
- char32 *save = Reg_Input;
- /* Temporarily ignore the logical end of the string, to allow
- lookahead past the end. */
- const char32 *saved_end = End_Of_String;
- End_Of_String = NULL;
- int answer = match (next, NULL); /* Does the look-ahead regex match? */
- CHECK_RECURSION_LIMIT
- if ( (GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : ! answer) {
- /* Remember the last (most to the right) character position
- that we consume in the input for a successful match. This
- is info that may be needed should an attempt be made to
- match the exact same text at the exact same place. Since
- look-aheads backtrack, a regex with a trailing look-ahead
- may need more text than it matches to accomplish a
- re-match. */
- if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
- Extent_Ptr_FW = Reg_Input;
- }
- Reg_Input = save; /* Backtrack to look-ahead start. */
- End_Of_String = saved_end; /* Restore logical end. */
- /* Jump to the node just after the (?=...) or (?!...)
- Construct. */
- next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
- /* Skip the chain of branches inside the look-ahead */
- while (GET_OP_CODE (next) == BRANCH) {
- next = next_ptr (next);
- }
- next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
- } else {
- Reg_Input = save; /* Backtrack to look-ahead start. */
- End_Of_String = saved_end; /* Restore logical end. */
- MATCH_RETURN (0)
- }
- }
- break;
- case POS_BEHIND_OPEN:
- case NEG_BEHIND_OPEN: {
- char32 *save;
- int answer;
- int offset, upper;
- int lower;
- int found = 0;
- const char32 *saved_end;
- save = Reg_Input;
- saved_end = End_Of_String;
- /* Prevent overshoot (greedy matching could end past the
- current position) by tightening the matching boundary.
- Lookahead inside lookbehind can still cross that boundary. */
- End_Of_String = Reg_Input;
- lower = GET_LOWER (scan);
- upper = GET_UPPER (scan);
- /* Start with the shortest match first. This is the most
- efficient direction in general.
- Note! Negative look behind is _very_ tricky when the length
- is not constant: we have to make sure the expression doesn't
- match for _any_ of the starting positions. */
- for (offset = lower; offset <= upper; ++offset) {
- Reg_Input = save - offset;
- if (Reg_Input < Look_Behind_To) {
- /* No need to look any further */
- break;
- }
- answer = match (next, NULL); /* Does the look-behind regex match? */
- CHECK_RECURSION_LIMIT
- /* The match must have ended at the current position;
- otherwise it is invalid */
- if (answer && Reg_Input == save) {
- /* It matched, exactly far enough */
- found = 1;
- /* Remember the last (most to the left) character position
- that we consume in the input for a successful match.
- This is info that may be needed should an attempt be
- made to match the exact same text at the exact same
- place. Since look-behind backtracks, a regex with a
- leading look-behind may need more text than it matches
- to accomplish a re-match. */
- if (Extent_Ptr_BW == NULL ||
- (Extent_Ptr_BW - (save - offset)) > 0) {
- Extent_Ptr_BW = save - offset;
- }
- break;
- }
- }
- /* Always restore the position and the logical string end. */
- Reg_Input = save;
- End_Of_String = saved_end;
- if ( (GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
- /* The look-behind matches, so we must jump to the next
- node. The look-behind node is followed by a chain of
- branches (contents of the look-behind expression), and
- terminated by a look-behind-close node. */
- next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
- /* Skip the chained branches inside the look-ahead */
- while (GET_OP_CODE (next) == BRANCH) {
- next = next_ptr (next);
- }
- next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
- } else {
- /* Not a match */
- MATCH_RETURN (0)
- }
- }
- break;
- case LOOK_AHEAD_CLOSE:
- case LOOK_BEHIND_CLOSE:
- MATCH_RETURN (1); /* We have reached the end of the look-ahead or
- look-behind which implies that we matched it,
- so return `true`. */
- default:
- if ( (GET_OP_CODE (scan) > OPEN) &&
- (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
- int no;
- char32 *save;
- no = GET_OP_CODE (scan) - OPEN;
- save = Reg_Input;
- if (no < 10) {
- Back_Ref_Start [no] = save;
- Back_Ref_End [no] = NULL;
- }
- if (match (next, NULL)) {
- /* Do not set `Start_Ptr_Ptr' if some later invocation (think
- recursion) of the same parentheses already has. */
- if (Start_Ptr_Ptr [no] == NULL) {
- Start_Ptr_Ptr [no] = save;
- }
- MATCH_RETURN (1)
- } else {
- MATCH_RETURN (0)
- }
- } else if ( (GET_OP_CODE (scan) > CLOSE) &&
- (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
- int no;
- char32 *save;
- no = GET_OP_CODE (scan) - CLOSE;
- save = Reg_Input;
- if (no < 10) {
- Back_Ref_End [no] = save;
- }
- if (match (next, NULL)) {
- /* Do not set `End_Ptr_Ptr' if some later invocation of the
- same parentheses already has. */
- if (End_Ptr_Ptr [no] == NULL) {
- End_Ptr_Ptr [no] = save;
- }
- MATCH_RETURN (1)
- } else {
- MATCH_RETURN (0)
- }
- } else {
- reg_error (U"memory corruption, `match\'");
- MATCH_RETURN (0)
- }
- break;
- }
- scan = next;
- }
- /* We get here only if there's trouble -- normally "case END" is
- the terminating point. */
- reg_error (U"corrupted pointers, `match\'");
- MATCH_RETURN (0)
- }
- /*----------------------------------------------------------------------*
- * greedy
- *
- * Repeatedly match something simple up to "max" times. If max <= 0
- * then match as much as possible (max = infinity). Uses unsigned long
- * variables to maximize the amount of text matchable for unbounded
- * qualifiers like '*' and '+'. This will allow at least 4,294,967,295
- * matches (4 Gig!) for an ANSI C compliant compiler. If you are
- * applying a regex to something bigger than that, you shouldn't be
- * using NEdit!
- *
- * Returns the actual number of matches.
- *----------------------------------------------------------------------*/
- static unsigned long greedy (char32 *p, long max) {
- char32 *input_str;
- char32 *operand;
- unsigned long count = REG_ZERO;
- unsigned long max_cmp;
- input_str = Reg_Input;
- operand = OPERAND (p); /* Literal char or start of class characters. */
- max_cmp = (max > 0) ? (unsigned long) max : ULONG_MAX;
- switch (GET_OP_CODE (p))
- {
- case ANY: /* Race to the end of the line or string. Dot DOESN'T match newline. */
- while (count < max_cmp && *input_str != '\n' && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case EVERY: /* Race to the end of the line or string. Dot DOES match newline. */
- while (count < max_cmp && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case EXACTLY: /* Count occurrences of single character operand. */
- while (count < max_cmp && *operand == *input_str && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case SIMILAR: /* Case-insensitive version of EXACTLY */
- while (count < max_cmp && *operand == Melder_toLowerCase (*input_str) && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case ANY_OF: /* [...] character class. */
- while (count < max_cmp && str32chr (operand, *input_str) != NULL && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case ANY_BUT: /* [^...] Negated character class- does NOT normally
- match newline (\n added usually to operand at compile
- time.) */
- while (count < max_cmp && str32chr (operand, *input_str) == NULL && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case IS_DELIM: /* \y (not a word delimiter char)
- NOTE: '\n' and '\0' are always word delimiters. */
- while (count < max_cmp && ! Melder_isWordCharacter (*input_str) && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case NOT_DELIM: /* \Y (not a word delimiter char)
- NOTE: '\n' and '\0' are always word delimiters. */
- while (count < max_cmp && Melder_isWordCharacter (*input_str) && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case WORD_CHAR: /* \w (a word character: letter, number, mark or connector punctuation) */
- while (count < max_cmp && Melder_isWordCharacter (*input_str) && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case NOT_WORD_CHAR: /* \W (NOT a word character) */
- while (count < max_cmp && ! Melder_isWordCharacter (*input_str) && *input_str != U'\n' && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case DIGIT: /* for ASCII, use [0-9] */
- while (count < max_cmp && Melder_isDecimalNumber (*input_str) && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case NOT_DIGIT: /* for ASCII, use [^0-9] */
- while (count < max_cmp && ! Melder_isDecimalNumber (*input_str) && *input_str != U'\n' && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case SPACE: /* for ASCII, use [ \t\r\f\v]-- doesn't match newline. */
- while (count < max_cmp && Melder_isHorizontalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case SPACE_NL: /* for ASCII, use [\n \t\r\f\v]-- matches newline. */
- while (count < max_cmp && Melder_isHorizontalOrVerticalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case NOT_SPACE: /* for ASCII, use [^\n \t\r\f\v]-- doesn't match newline. */
- while (count < max_cmp && ! Melder_isHorizontalOrVerticalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case NOT_SPACE_NL: /* for ASCII, use [^ \t\r\f\v]-- matches newline. */
- while (count < max_cmp && ! Melder_isHorizontalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case LETTER: /* for ASCII, use [a-zA-Z] */
- while (count < max_cmp && Melder_isLetter (*input_str) && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- case NOT_LETTER: /* for ASCII, use [^a-zA-Z] */
- while (count < max_cmp && ! Melder_isLetter (*input_str) && *input_str != '\n' && ! AT_END_OF_STRING (input_str)) {
- count ++;
- input_str ++;
- }
- break;
- default:
- /* Called inappropriately. Only atoms that are SIMPLE should
- generate a call to greedy. The above cases should cover
- all the atoms that are SIMPLE. */
- reg_error (U"internal error #10 `greedy\'");
- count = 0U; /* Best we can do. */
- }
- /* Point to character just after last matched character. */
- Reg_Input = input_str;
- return (count);
- }
- /*----------------------------------------------------------------------*
- * next_ptr - compute the address of a node's "NEXT" pointer.
- * Note: a simplified inline version is available via the NEXT_PTR() macro,
- * but that one is only to be used at time-critical places (see the
- * description of the macro).
- *----------------------------------------------------------------------*/
- static char32 *next_ptr (char32 *ptr) {
- if (ptr == & Compute_Size)
- return nullptr;
- int offset = GET_OFFSET (ptr);
- if (offset == 0)
- return nullptr;
- if (GET_OP_CODE (ptr) == BACK) {
- return ptr - offset;
- } else {
- return ptr + offset;
- }
- }
- /*
- ** SubstituteRE - Perform substitutions after a `regexp' match.
- **
- ** This function cleanly shortens results of more than max length to max.
- ** To give the caller a chance to react to this the function returns False
- ** on any error. The substitution will still be executed.
- */
- int SubstituteRE (const regexp *prog, conststring32 source, mutablestring32 dest, int max, int *errorType) {
- const char32 *src;
- const char32 *src_alias;
- char32 *dst;
- char32 c;
- char32 test;
- int paren_no;
- int len;
- char32 chgcase;
- bool anyWarnings = false;
- *errorType = 0;
- if (! prog || ! source || ! dest) {
- reg_error (U"NULL parm to `SubstituteRE\'");
- *errorType = 2;
- return false;
- }
- if (U_CHAR_AT (prog->program) != MAGIC) {
- *errorType = 3;
- reg_error (U"damaged regexp passed to `SubstituteRE\'");
- return false;
- }
- src = source;
- dst = dest;
- while ( (c = *src++) != U'\0') {
- chgcase = U'\0';
- paren_no = -1;
- if (c == U'\\') {
- /* Process any case-altering tokens, i.e \u, \U, \l, \L. */
- if (*src == U'u' || *src == U'U' || *src == U'l' || *src == U'L') {
- chgcase = *src;
- src ++;
- c = *src ++;
- if (c == U'\0')
- break;
- }
- }
- if (c == U'&') {
- paren_no = 0;
- } else if (c == U'\\') {
- /* Can not pass register variable `&src' to function `numeric_escape'
- so make a non-register copy that we can take the address of. */
- src_alias = src;
- if (U'1' <= *src && *src <= U'9') {
- paren_no = (int) * src++ - (int) '0';
- } else if ( (test = literal_escape (*src)) != U'\0') {
- c = test; src++;
- } else if ( (test = numeric_escape (*src, (char32 **) &src_alias)) != U'\0') {
- c = test;
- src = src_alias;
- src ++;
- /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
- will be treated as a literal string. */
- } else if (*src == U'\0') {
- /* If '\' is the last character of the replacement string, it is
- interpreted as a literal backslash. */
- c = U'\\';
- } else {
- c = *src ++; /* Allow any escape sequence (This is */
- } /* INCONSISTENT with the `CompileRE' */
- } /* mind set of issuing an error! */
- if (paren_no < 0) { /* Ordinary character. */
- if ( (dst - dest) >= (max - 1)) {
- *errorType = 1;
- reg_error (U"replacing expression in `SubstituteRE\' too long; truncating");
- anyWarnings = true;
- break;
- } else {
- *dst++ = c;
- }
- } else if (prog->startp [paren_no] != NULL &&
- prog->endp [paren_no] != NULL) {
- len = prog->endp [paren_no] - prog->startp [paren_no];
- if ( (dst + len - dest) >= max - 1) {
- *errorType = 1;
- reg_error (U"replacing expression in `SubstituteRE\' too long; truncating");
- anyWarnings = true;
- len = max - (dst - dest) - 1;
- }
- (void) str32ncpy (dst, prog->startp [paren_no], len);
- if (chgcase != U'\0') {
- adjustcase (dst, len, chgcase);
- }
- dst += len;
- if (len != 0 && * (dst - 1) == U'\0') { /* strncpy hit NUL. */
- *errorType = 3;
- reg_error (U"damaged match string in `SubstituteRE\'");
- anyWarnings = true;
- }
- }
- }
- *dst = U'\0';
- return ! anyWarnings;
- }
- static void adjustcase (char32 *str, int len, char32 chgcase) {
- char32 *string = str;
- /* The tokens \u and \l only modify the first character while the tokens
- \U and \L modify the entire string. */
- if (iswlower ((int) chgcase) && len > 0)
- len = 1;
- switch (chgcase) {
- case U'u':
- case U'U':
- for (integer i = 0; i < len; i ++)
- * (string + i) = Melder_toUpperCase (* (string + i));
- break;
- case U'l':
- case U'L':
- for (integer i = 0; i < len; i ++)
- * (string + i) = Melder_toLowerCase (* (string + i));
- break;
- }
- }
- /*----------------------------------------------------------------------*
- * reg_error
- *----------------------------------------------------------------------*/
- static void reg_error (conststring32 str) {
- Melder_appendError (U"Internal error processing regular expression: ", str);
- }
- void EnableCountingQuantifier (int is_enabled) {
- Enable_Counting_Quantifier = is_enabled;
- }
- /* End of file regularExp.cpp */
|