12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
7627862796280628162826283628462856286628762886289629062916292629362946295629662976298629963006301630263036304630563066307630863096310631163126313631463156316631763186319632063216322632363246325632663276328632963306331633263336334633563366337633863396340634163426343634463456346634763486349635063516352635363546355635663576358635963606361636263636364636563666367636863696370637163726373637463756376637763786379638063816382638363846385638663876388638963906391639263936394639563966397639863996400640164026403640464056406640764086409641064116412641364146415641664176418641964206421642264236424642564266427642864296430643164326433643464356436643764386439644064416442644364446445644664476448644964506451645264536454645564566457645864596460646164626463646464656466646764686469647064716472647364746475647664776478647964806481648264836484648564866487648864896490649164926493649464956496649764986499650065016502650365046505650665076508650965106511651265136514651565166517651865196520652165226523652465256526652765286529653065316532653365346535653665376538653965406541654265436544654565466547654865496550655165526553655465556556655765586559656065616562656365646565656665676568656965706571657265736574657565766577657865796580658165826583658465856586658765886589659065916592659365946595659665976598659966006601660266036604660566066607660866096610661166126613661466156616661766186619662066216622662366246625662666276628662966306631663266336634663566366637663866396640664166426643664466456646664766486649665066516652665366546655665666576658665966606661666266636664666566666667666866696670667166726673667466756676667766786679668066816682668366846685668666876688 |
- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
- "http://www.w3.org/TR/html4/loose.dtd">
- <html>
- <head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <meta http-equiv="Content-Language" content="en-us">
- <title>UAX #44: Unicode Character Database</title>
- <link rel="stylesheet" type="text/css" href="http://www.unicode.org/reports/reports-v2.css">
- <style type="text/css">
- th { background-color: #CCFFCC }
- td.lightgray { background-color: #E4E4E4 }
- </style>
- </head>
- <body>
- <table class="header" cellspacing="0" cellpadding="0" width="100%">
- <tr>
- <td class="icon"><a href="http://www.unicode.org">
- <img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a>
- <a class="bar" href="http://www.unicode.org/reports/">Technical Reports</a></td>
- </tr>
- <tr>
- <td class="gray"> </td>
- </tr>
- </table>
- <div class="body">
- <!--
- <h2 class="uaxtitle"><span class="changedspan">Proposed Update</span></h2>
- -->
- <h2 class="uaxtitle">Unicode® Standard Annex #44</h2>
- <h1>Unicode Character Database</h1>
- <table class="simple" width="90%">
- <tr>
- <td valign="top" width="20%">Version</td>
- <td valign="top">Unicode 10.0.0</td>
- </tr>
- <tr>
- <td valign="top">Editors</td>
- <td valign="top"><a href="https://plus.google.com/114199149796022210033?rel=author">Mark Davis</a> (<a href="mailto:markdavis@google.com">markdavis@google.com</a>), Laurențiu Iancu (<a href="mailto:liancu@unicode.org">liancu@unicode.org</a>)
- and Ken Whistler (<a href="mailto:ken@unicode.org">ken@unicode.org</a>)</td>
- </tr>
- <tr>
- <td valign="top">Date</td>
- <td valign="top">2017-06-14</td>
- </tr>
- <tr>
- <td valign="top">This Version</td>
- <td valign="top">
- <a href="http://www.unicode.org/reports/tr44/tr44-20.html">http://www.unicode.org/reports/tr44/tr44-20.html</a>
- </td>
- </tr>
- <tr>
- <td valign="top">Previous Version</td>
- <td valign="top">
- <a href="http://www.unicode.org/reports/tr44/tr44-18.html">http://www.unicode.org/reports/tr44/tr44-18.html</a>
- </td>
- </tr>
- <tr>
- <td valign="top">Latest Version</td>
- <td valign="top"><a href="http://www.unicode.org/reports/tr44/">http://www.unicode.org/reports/tr44/</a></td>
- </tr>
- <tr>
- <td valign="top">Latest Proposed Update</td>
- <td valign="top"><a href="http://www.unicode.org/reports/tr44/proposed.html">http://www.unicode.org/reports/tr44/proposed.html</a></td>
- </tr>
- <tr>
- <td valign="top">Revision</td>
- <td valign="top"><a href="#Modifications">20</a></td>
- </tr>
- </table>
-
- <h4 class="summary">Summary</h4>
- <blockquote>
- <p><i>This annex provides the core documentation for the
- Unicode Character Database (UCD). It describes the layout and organization of the Unicode
- Character Database and how it specifies the formal definitions of the Unicode Character Properties.</i></p>
- </blockquote>
-
- <h4 class="status">Status</h4>
- <!-- NOT YET APPROVED
- <p><i><span class="changed">This is a<b><font color="#ff3333"> draft </font></b>document which
- may be updated, replaced, or superseded by other documents at any time.
- Publication does not imply endorsement by the Unicode Consortium. This is
- not a stable document; it is inappropriate to cite this document as other
- than a work in progress.</span></i></p>
- END NOT YET APPROVED -->
- <!-- APPROVED -->
- <p><i>This document has been reviewed by Unicode members and other interested
- parties, and has been approved for publication by the Unicode Consortium.
- This is a stable document and may be used as reference material or cited as
- a normative reference by other specifications.</i></p>
- <!-- END APPROVED -->
- <blockquote>
- <p><i><b>A Unicode Standard Annex (UAX)</b> forms an integral part of the
- Unicode Standard, but is published online as a separate document. The
- Unicode Standard may require conformance to normative content in a Unicode
- Standard Annex, if so specified in the Conformance chapter of that version
- of the Unicode Standard. The version number of a UAX document corresponds to
- the version of the Unicode Standard of which it forms a part.</i></p>
- </blockquote>
- <p><i>Please submit corrigenda and other comments with the online reporting
- form [<a href="http://www.unicode.org/reporting.html">Feedback</a>].
- Related information that is useful in understanding this annex is found in Unicode Standard Annex #41,
- “<a href="http://www.unicode.org/reports/tr41/tr41-21.html">Common References for Unicode Standard Annexes</a>.”
- For the latest version of the Unicode Standard, see [<a href="http://www.unicode.org/versions/latest/">Unicode</a>].
- For a list of current Unicode Technical Reports, see [<a href="http://www.unicode.org/reports/">Reports</a>].
- For more information about versions of the Unicode Standard, see [<a href="http://www.unicode.org/versions/">Versions</a>].
- For any errata which may apply to this annex, see [<a href="http://www.unicode.org/errata/">Errata</a>].</i></p>
-
- <h4 class="contents">Contents</h4>
- <ul class="toc">
- <li>1 <a href="#Introduction">Introduction</a></li>
- <li>2 <a href="#Conformance">Conformance</a>
- <ul class="toc">
- <li>2.1 <a href="#Simple_Derived">Simple and Derived Properties</a></li>
- <li>2.2 <a href="#Use_Default">Use of Default Values</a></li>
- <li>2.3 <a href="#Release_Stability">Stability of Releases</a></li>
- </ul></li>
- <li>3 <a href="#Documentation_Files">Documentation</a>
- <ul class="toc">
- <li>3.1 <a href="#Character_Properties">Character Properties in the Standard</a></li>
- <li>3.2 <a href="#Property_Model">The Character Property Model</a></li>
- <li>3.3 <a href="#NamesList">NamesList.html</a></li>
- <li>3.4 <a href="#StandardizedVariants">StandardizedVariants.html</a></li>
- <li>3.5 <a href="#EmojiVariants">Emoji Variation Sequences</a></li>
- <li>3.6 <a href="#Unihan">Unihan and UAX #38</a></li>
- <li>3.7 <a href="#USource">UTC-Source Ideographs and UAX #45</a></li>
- <li>3.8 <a href="#Data_File_Comments">Data File Comments</a></li>
- <li>3.9 <a href="#Obsolete">Obsolete Documentation Files</a></li>
- </ul></li>
- <li>4 <a href="#UCD_Files">UCD Files</a>
- <ul class="toc">
- <li>4.1 <a href="#Directory_Structure">Directory Structure</a></li>
- <li>4.2 <a href="#Format_Conventions">File Format Conventions</a></li>
- <li>4.3 <a href="#File_List">File List</a></li>
- <li>4.4 <a href="#Zipped_Files">Zipped Files</a></li>
- <li>4.5 <a href="#UCD_in_XML">UCD in XML</a></li>
- </ul></li>
- <li>5 <a href="#Properties">Properties</a>
- <ul class="toc">
- <li>5.1 <a href="#Property_Index">Property Index</a></li>
- <li>5.2 <a href="#About_Property_Table">About the Property Table</a></li>
- <li>5.3 <a href="#Property_Definitions">Property Definitions</a></li>
- <li>5.4 <a href="#Derived_Extracted">Derived Extracted Properties</a></li>
- <li>5.5 <a href="#Contributory_Properties">Contributory Properties</a></li>
- <li>5.6 <a href="#Casemapping">Case and Case Mapping</a></li>
- <li>5.7 <a href="#Property_Values">Property Value Lists</a></li>
- <li>5.8 <a href="#Property_And_Value_Aliases">Property and Property Value Aliases</a></li>
- <li>5.9 <a href="#Matching_Rules">Matching Rules</a></li>
- <li>5.10 <a href="#Invariants">Invariants</a></li>
- <li>5.11 <a href="#Validation">Validation</a></li>
- <li>5.12 <a href="#Deprecation">Deprecation</a></li>
- <li>5.13 <a href="#Property_APIs">Property APIs</a></li>
- <li>5.14 <a href="#Character_Age">Character Age</a></li>
- </ul></li>
- <li>6 <a href="#Test_Files">Test Files</a>
- <ul class="toc">
- <li>6.1 <a href="#NormalizationTest_txt">NormalizationTest.txt</a></li>
- <li>6.2 <a href="#Segmentation_Test_Files">Segmentation Test Files and Documentation</a></li>
- <li>6.3 <a href="#BidiTest_txt">Bidirectional Test Files</a></li>
- </ul></li>
- <li>7 <a href="#Change_History">UCD Change History</a></li>
- <li><a href="#Acknowledgments">Acknowledgments</a></li>
- <li><a href="#References">References</a></li>
- <li><a href="#Modifications">Modifications</a></li>
- </ul>
- <hr>
- <blockquote>
- <p><i><b>Note:</b> the information in
- this annex is not intended as an exhaustive description of the use and
- interpretation of Unicode character properties and behavior. It must be used in conjunction with
- the data in the other files in the Unicode Character Database, and relies on the notation and
- definitions supplied in <a href="http://www.unicode.org/standard/standard.html">The Unicode
- Standard</a>. All chapter references are to Version
- 10.0.0 of the standard unless otherwise indicated.</i></p>
- </blockquote>
- <h2>1 <a name="Introduction" href="#Introduction">Introduction</a></h2>
-
- <p>The Unicode Standard is far more than a simple encoding of characters.
- The standard also associates a rich set of semantics with each encoded
- character—properties that
- are required for interoperability and correct behavior in
- implementations, as well as for Unicode conformance.
- These semantics are cataloged in the Unicode Character Database (UCD), a collection of data files
- which contain the Unicode character code points and character names.
- The data files define the Unicode character properties and mappings between
- Unicode characters (such as case mappings).</p>
-
- <p>This annex describes the UCD and provides a guide to the various
- documentation files associated with it. Additional information
- about character properties and their use is contained in the
- Unicode Standard and its annexes. In particular, implementers should familiarize themselves
- with the formal definitions and conformance requirements for properties detailed
- in <i>Section 3.5, Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>]
- and with the material in <i>Chapter 4, Character Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</p>
-
- <p>The latest version of the UCD is always located on the Unicode
- website at:</p>
- <blockquote>
- <a href="http://www.unicode.org/Public/UCD/latest/">http://www.unicode.org/Public/UCD/latest/</a>
- </blockquote>
- <p>The specific files for the UCD associated with this version of
- the Unicode Standard (10.0.0) are located at:</p>
- <blockquote>
- <a href="http://www.unicode.org/Public/10.0.0/">http://www.unicode.org/Public/10.0.0/</a>
- </blockquote>
- <p>Stable, archived versions of the UCD associated with all earlier
- versions of the Unicode Standard can be accessed from: </p>
- <blockquote>
- <a href="http://www.unicode.org/ucd/">http://www.unicode.org/ucd/</a>
- </blockquote>
-
- <p>For a description of the changes in the UCD for
- this version and earlier versions, see the
- <a href="#Change_History">UCD Change History</a>.</p>
-
- <h2>2 <a name="Conformance" href="#Conformance">Conformance</a></h2>
-
- <p>The Unicode Character Database is an integral part of the Unicode Standard.</p>
-
- <p>The UCD contains normative property and mapping information required for
- implementation of various Unicode algorithms such as the Unicode Bidirectional
- Algorithm, Unicode Normalization, and Unicode Casefolding. The data files also
- contain additional informative and provisional character property information.</p>
- <p>Each specification of a Unicode algorithm, whether specified in the text of
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>] or in one of the Unicode
- Standard Annexes, designates which data file(s) in the UCD are needed to
- provide normative property information required by that algorithm.</p>
- <p>For information on the meaning and application of the terms,
- <i>normative</i>, <i>informative</i>, and <i>provisional</i>, see <i>Section 3.5,
- Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</p>
- <p>For information about the applicable terms of use for the
- UCD, see the Unicode <a href="http://www.unicode.org/copyright.html">Terms of Use</a>.</p>
- <h3>2.1 <a name="Simple_Derived" href="#Simple_Derived">Simple and Derived Properties</a></h3>
- <h4>2.1.1 <a name="Simple_Props" href="#Simple_Props">Simple Properties</a></h4>
- <p>Some character properties in the UCD are simple properties.
- This status has no bearing on whether or not the properties are
- normative, but merely indicates that their values
- are not derived from some combination of other properties.</p>
- <h4>2.1.2 <a name="Derived_Props" href="#Derived_Props">Derived Properties</a></h4>
- <p>Other character properties are derived. This means that
- their values are derived by rule from some other
- combination of properties. Generally such rules are
- stated as set operations, and may or may not include
- explicit exception lists for individual characters.</p>
- <p>Certain simple properties are defined merely
- to make the statement of the rule defining a derived
- property more compact or general. Such properties are
- known as <a href="#Contributory_Properties">contributory properties</a>.
- Sometimes these contributory properties are defined to
- encapsulate the messiness inherent in exception
- lists. At other times, a contributory property may
- be defined to help stabilize the definition of
- an important derived property which is subject to stability
- guarantees.</p>
- <p>Derived character properties are not considered
- second-class citizens among Unicode character properties.
- They are defined to make implementation of important
- algorithms easier to state. Included among the
- first-class derived properties important for such
- implementations are: Uppercase, Lowercase, XID_Start,
- XID_Continue, Math, and Default_Ignorable_Code_Point, all
- defined in DerivedCoreProperties.txt, as well as derived
- properties for the optimization of normalization, defined
- in DerivedNormalizationProps.txt.</p>
- <p>Implementations should simply use the derived properties,
- and should not try to rederive them from lists of simple
- properties and collections of rules, because of the
- chances for error and divergence when doing so.</p>
- <p>Definitions of property derivations are provided
- for information only, typically in comment fields
- in the data files. Such definitions may be refactored,
- refined, or corrected over time. These
- definitions are presented in a modified set notation, expressed
- as set additions and/or subtractions of various other property
- values. For example:</p>
- <blockquote>
- <pre>
- # Derived Property: ID_Start
- # Characters that can start an identifier.
- # Generated from:
- # Lu + Ll + Lt + Lm + Lo + Nl
- # + Other_ID_Start
- # - Pattern_Syntax
- # - Pattern_White_Space
- </pre>
- </blockquote>
- <p>When interpreting definitions of derived properties
- of this sort, keep in mind that set subtraction is not a commutative
- operation. Thus "Lo + Lm - Pattern_Syntax" defines a different set
- than "Lo - Pattern_Syntax + Lm". The order of property set operations
- stated in the definitions affects the composition of
- the derived set.</p>
- <p>If there are any cases of mismatches
- between the definition of a derived property as
- listed in DerivedCoreProperties.txt or similar data
- files in the UCD, and the definition of a derived
- property as a set definition rule, the explicit
- listing in the data file should <i>always</i> be taken
- as the normative definition of the property. As described
- in <a href="#Release_Stability">Stability of Releases</a>, the property
- listing in the data files for any given version
- of the standard will never change for that version.</p>
- <h4>2.1.3 <a name="Props_External" href="#Props_External">Properties Dependent on External Specifications</a></h4>
- <p>In limited cases, a Unicode character property defined in the Unicode Character Database
- may have an external dependency on another specification which is not a part of the Unicode Standard,
- and whose data is not formally part of the UCD. In such cases, version stability for the UCD is attained by
- requiring that dependency to be based on a known, published version of the external specification.</p>
- <p>As of Version 10.0 of the UCD, the clear example of such an external dependency is the
- derivation of some segmentation-related character properties, in part based on emoji properties associated with
- UTS #51, "Unicode Emoji" [<a href="../tr41/tr41-21.html#UTS51">UTS51</a>]. The details of the
- derivation are described in the respective annexes, [<a href="../tr41/tr41-21.html#UAX14">UAX14</a>]
- and [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>], as well as in the documentation portions of
- the associated UCD property files. See [<a href="../tr41/tr41-21.html#Data14">Data14</a>]
- and [<a href="../tr41/tr41-21.html#Props0">Props</a>].
- The version of UTS #51 used for those segmentation properties in Version 10.0 of the UCD is clearly
- identified in those annexes and data files.</p>
- <p>An external dependency may impact either a simple or a derived property. For example,
- the Line_Break property is considered a simple, enumerated property. However, two of the enumerated
- values, lb=Emoji_Base and lb=Emoji_Modifier, are synchronized with the associated emoji properties in
- emoji-data.txt. In the case of the derived segmentation properties associated with UAX #29,
- Grapheme_Cluster_Break, Word_Break, and Sentence_Break, the dependencies are considerably more complex.
- See [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>] for full details.</p>
- <h3>2.2 <a name="Use_Default" href="#Use_Default">Use of Default Values</a></h3>
- <p>Unicode character properties have default values. Default
- values are the value or values that a character property takes
- for an unassigned code point, or in some instances, for
- designated subranges of code points, whether assigned or
- unassigned. For example, the default value of a binary
- Unicode character property is always "N".</p>
- <p>For the formal discussion of default values, see D26 in
- <i>Section 3.5, Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- For conventions related to default values in various data files
- of the UCD and for documentation regarding the particular default values of
- individual Unicode character properties, see <a href="#Default_Values">Default Values</a>.</p>
- <h3>2.3 <a name="Release_Stability" href="#Release_Stability">Stability of Releases</a></h3>
- <p>Just as for the Unicode Standard as a whole, each version of the
- UCD, once published, is absolutely stable and will <i>never</i>
- change. Each released version is archived in a directory on
- the Unicode website, with a directory number associated with
- that version. URLs pointing to that version's directory are also
- stable and will be maintained in perpetuity.</p>
- <p>Any errors discovered for a released version of the UCD
- are noted in [<a href="../tr41/tr41-21.html#Errata">Errata</a>],
- and if appropriate will be corrected in a <i>subsequent</i>
- version of the UCD.</p>
- <p>Stability guarantees constraining how Unicode character
- properties can (or cannot) change between releases of the UCD
- are documented in the Unicode Consortium Stability
- Policies [<a href="../tr41/tr41-21.html#Stability">Stability</a>].</p>
- <h4>2.3.1 <a name="Allowed_Changes" href="#Allowed_Changes">Changes to Properties Between Releases</a></h4>
- <p>Updates to character properties in the Unicode Character Database may be required
- for any of three reasons:</p>
- <ol>
- <li>To cover new characters added to the standard</li>
- <li>To add new character properties to the standard</li>
- <li>To change the assigned values for a property for some characters already in the standard</li>
- </ol>
- <p>While the Unicode Consortium endeavors to keep the values of all
- character properties as stable as possible between versions, occasionally circumstances
- may arise which require changing them. In particular, as less well-documented scripts, such
- as those for minority languages, or historic scripts are added to the standard, the exact
- character properties and behavior may not fully be known when the script is first encoded.
- The properties for some of these characters may change as further information becomes
- available or as implementations turn up problems in the initial property assignments.
- As far as possible, any readjustment of property values based
- on growing implementation experience is made to be compatible with established practice.</p>
- <p>All changes to normative or informative property values, to the status
- or type of a property, or to property or property value aliases, must be approved by
- an explicit decision taken by the Unicode Technical Committee. Changes to provisional
- property values are subject to less stringent oversight.</p>
- <p>Occasionally, a character property value is changed to prevent incorrect generalizations
- about a character's use based on its nominal property values. For example, U+200B ZERO
- WIDTH SPACE was originally classified as a space character (General_Category=Zs), but
- it was reclassified as a Format character (General_Category=Cf) to clearly distinguish it from space characters
- in its function as a format control for line breaking.</p>
- <p>There is no guarantee that a particular value for an enumerated
- property will actually have characters associated with it. Also, because of
- changes in property value assignments between versions of the standard, a
- property value that once had characters associated with it may later have none.
- Such conditions and changes are rare, but implementations must not
- assume that all property values are associated with non-null
- sets of characters. For example, currently the special Script property
- value Katakana_Or_Hiragana has no characters associated with it.</p>
- <h4>2.3.2 <a name="Obsolete_Properties" href="#Obsolete_Properties">Obsolete Properties</a></h4>
- <p>In some instances an entire property may become <i>obsolete</i>.
- For example, the <a href="#ISO_Comment">ISO_Comment</a> property was once used to keep
- track of annotations for characters used in the production of name lists for
- ISO/IEC 10646 code charts. As of Unicode 5.2.0 that property became obsolete,
- and its value is now defaulted to the null string for all Unicode code points.</p>
- <p>An obsolete property is never removed from the UCD.</p>
- <h4>2.3.3 <a name="Deprecated_Properties" href="#Deprecated_Properties">Deprecated Properties</a></h4>
- <p>Occasionally an obsolete property may also be formally
- <i>deprecated</i>. This is an indication that the property is no longer recommended for
- use, perhaps because its original intent has been replaced by another property
- or because its specification was somehow defective. See also the
- general discussion of <a href="#Deprecation">Deprecation</a>.</p>
- <p>A deprecated property is never removed from the UCD.</p>
- <p><i>Table 1</i> lists the properties that are formally deprecated as of
- this version of the Unicode Standard.</p>
- <p class="caption">Table 1. <a name="Deprecated_Property_Table" href="#Deprecated_Property_Table">Deprecated Properties</a></p>
- <div align="center">
-
- <table class="simple">
- <tr>
- <th>Property Name</th>
- <th>Deprecation Version</th>
- <th>Reason</th>
- </tr>
- <tr>
- <td><a href="#Grapheme_Link">Grapheme_Link</a></td>
- <td>5.0.0</td>
- <td>Duplication of ccc=9</td>
- </tr>
- <tr>
- <td><a href="#Hyphen">Hyphen</a></td>
- <td>6.0.0</td>
- <td>Supplanted by Line_Break property values</td>
- </tr>
- <tr>
- <td><a href="#ISO_Comment">ISO_Comment</a></td>
- <td>6.0.0</td>
- <td>No longer needed for chart generation; otherwise not useful</td>
- </tr>
- <tr>
- <td><a href="#Expands_On_NFC">Expands_On_NFC</a></td>
- <td>6.0.0</td>
- <td>Less useful than UTF-specific calculations</td>
- </tr>
- <tr>
- <td><a href="#Expands_On_NFD">Expands_On_NFD</a></td>
- <td>6.0.0</td>
- <td>Less useful than UTF-specific calculations</td>
- </tr>
- <tr>
- <td><a href="#Expands_On_NFKC">Expands_On_NFKC</a></td>
- <td>6.0.0</td>
- <td>Less useful than UTF-specific calculations</td>
- </tr>
- <tr>
- <td><a href="#Expands_On_NFKD">Expands_On_NFKD</a></td>
- <td>6.0.0</td>
- <td>Less useful than UTF-specific calculations</td>
- </tr>
- <tr>
- <td><a href="#FC_NFKC_Closure">FC_NFKC_Closure</a></td>
- <td>6.0.0</td>
- <td>Supplanted in usage by <a href="#NFKC_Casefold">NFKC_Casefold</a>; otherwise not useful</td>
- </tr>
- </table>
- </div>
- <p> </p>
-
- <h4>2.3.4 <a name="Stabilized_Properties" href="#Stabilized_Properties">Stabilized Properties</a></h4>
- <p>Another possibility is that an obsolete property may be
- declared to be <i>stabilized</i>. Such a determination does not indicate that
- the property should or should not be used; instead it is a declaration that the
- UTC (Unicode Technical Committee) will no longer actively maintain the property or extend it for newly
- encoded characters. The property values of a
- stabilized property are frozen as of a particular release of the standard.</p>
- <p>A stabilized property is never removed from the UCD.</p>
- <p><i>Table 2</i> lists the properties that are formally stabilized as of
- this version of the Unicode Standard.</p>
- <p class="caption">Table 2. <a name="Stabilized_Property_Table" href="#Stabilized_Property_Table">Stabilized Properties</a></p>
- <div align="center">
-
- <table class="simple">
- <tr>
- <th>Property Name</th>
- <th>Stabilization Version</th>
- </tr>
- <tr>
- <td><a href="#Hyphen">Hyphen</a></td>
- <td>4.0.0</td>
- </tr>
- <tr>
- <td><a href="#ISO_Comment">ISO_Comment</a></td>
- <td>6.0.0</td>
- </tr>
- </table>
- </div>
- <p> </p>
-
- <h2>3 <a name="Documentation_Files" href="#Documentation_Files">Documentation</a></h2>
- <p>This annex provides the core documentation for the UCD, but
- additional information about character properties is available in
- other parts of the standard and in additional documentation files
- contained within the UCD.</p>
- <h3>3.1 <a name="Character_Properties" href="#Character_Properties">Character Properties in the Standard</a></h3>
- <p>The formal definitions related to character properties used
- by the Unicode Standard are documented in
- <i>Section 3.5, Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- Understanding those definitions and related terminology is
- essential to the appropriate use of Unicode character properties.</p>
- <p>See <i>Section 4.1, Unicode Character Database</i>, in
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>] for a general
- discussion of the UCD and its use in defining properties. The
- rest of Chapter 4 provides important explanations regarding
- the meaning and use of various normative character properties.</p>
- <h3>3.2 <a name="Property_Model" href="#Property_Model">The Character Property Model</a></h3>
- <p>For a general discussion of the property model which underlies
- the definitions associated with the UCD, see
- Unicode Technical Report #23, "The Unicode Character Property Model" [<a href="../tr41/tr41-21.html#UTR23">UTR23</a>].
- That technical report is informative, but over the years various
- content from it has been incorporated into normative portions
- of the Unicode Standard, particularly for the definitions in
- Chapter 3.</p>
-
- <p>UTR #23 also discusses string functions and their relation to
- character properties.</p>
- <h3>3.3 <a name="NamesList" href="#NamesList">NamesList.html</a></h3>
- <p>NamesList.html formally describes the format of the NamesList.txt data file in BNF.
- That data file is used to drive the printing
- of the Unicode code charts and names list. See also <i>Section 24.1,
- Character Names List</i>, in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>]
- for a detailed discussion of the conventions used in the Unicode names list as
- formatted for printing.</p>
- <h3>3.4 <a name="StandardizedVariants" href="#StandardizedVariants">StandardizedVariants.html</a></h3>
- <p>StandardizedVariants.html has been obsoleted
- as of Version 9.0 of the UCD. This file formerly
- documented standardized variants, showing a
- representative glyph for each. It was closely tied to the data file,
- StandardizedVariants.txt, which defines those sequences normatively.</p>
- <p>The function of StandardizedVariants.html to show representative
- glyphs for standardized variants has been superseded. There are now better means
- of illustrating the glyphs. Many standardized variation sequences are shown
- in the Unicode code charts directly, in summary sections at the ends of the
- names list for any block which contains them. Glyphs for standardized variants
- of CJK compatibility ideographs are also shown directly in the Unicode
- code charts. Because of the specialized font display requirements for
- emoji, often involving color, the standardized emoji variation sequences are not shown in the
- Unicode code charts, but have their own dedicated display page instead.</p>
- <h3>3.5 <a name="EmojiVariants" href="#EmojiVariants">Emoji Variation Sequences</a></h3>
- <p>Starting with Version 9.0.0, the following page in the Unicode emoji
- subsite area shows appropriate representative glyphs for all emoji variation sequences:</p>
- <p><a href="http://www.unicode.org/emoji/charts/emoji-variants.html">http://www.unicode.org/emoji/charts/emoji-variants.html</a></p>
- <p>Emoji variation sequences are a subset of standardized variation sequences,
- consisting of an emoji base followed either by the variation selector U+FE0E or the
- variation selector U+FE0F. Such sequences come in pairs, with the sequence using U+FE0E
- shown with a black and white text presentation, as seen in the Unicode code charts,
- and with the sequence using U+FE0F shown with a colorful icon, as usually seen
- in emoji implementations on mobile devices and elsewhere.</p>
- <h3>3.6 <a name="Unihan" href="#Unihan">Unihan and UAX #38</a></h3>
- <p>Unicode Standard Annex #38, "Unicode Han Database (Unihan)"
- [<a href="../tr41/tr41-21.html#UAX38">UAX38</a>] describes
- the format and content of the Unihan Database, which collects together all property information
- for CJK Unified Ideographs. That annex also specifies in detail
- which of the Unihan character properties are normative,
- informative, or provisional.</p>
- <p>The Unihan Database contains extensive and detailed mapping
- information for CJK Unified Ideographs encoded in the Unicode Standard,
- but it is aimed <i>only</i> at those ideographs, not at other characters used in the East
- Asian context in general.
- In contrast, East Asian legacy character sets, including important
- commercial and national character set standards, contain many non-CJK
- characters. As a result, the Unihan Database must be supplemented from
- other sources to establish mapping tables for those character sets.</p>
- <p>The majority of the content of the Unihan Database is
- released for each version of the Unicode Standard as a collection of Unihan data
- files in the UCD. Because of their large size, these data files are released only as
- a zipped file, Unihan.zip. The details of the particular data files in Unihan.zip
- and the CJK properties each one contains are provided in [<a href="../tr41/tr41-21.html#UAX38">UAX38</a>].
- For versions of the UCD prior to Version 5.2.0, all of the CJK properties were
- listed together in a very large, single file, Unihan.txt.</p>
- <h3>3.7 <a name="USource" href="#USource">UTC-Source Ideographs and UAX #45</a></h3>
- <p>Unicode Standard Annex #45, "U-Source Ideographs"
- [<a href="../tr41/tr41-21.html#UAX45">UAX45</a>] describes the format of USourceData.txt,
- which lists all of the information for UTC-Source ideographs.</p>
- <h3>3.8 <a name="Data_File_Comments" href="#Data_File_Comments">Data File Comments</a></h3>
- <p>In addition to the specific documentation files for the UCD, individual data
- files often contain extensive header comments describing their content and any
- special conventions used in the data.</p>
- <p>In some instances, individual property
- definition sections also contain comments with information about how the property
- may be derived. Such comments are informative; while they are intended
- to convey the intent of the derivation, in case of any mismatch between
- a statement of a derivation in a comment field and the actual
- listing of the derived property, the list is considered to be definitive.
- See <a href="#Simple_Derived">Simple and Derived Properties</a>.</p>
- <h3>3.9 <a name="Obsolete" href="#Obsolete">Obsolete Documentation Files</a></h3>
- <p>UCD.html was formerly the primary documentation file for the UCD. As of Version 5.2.0, its
- content has been wholly incorporated into this document.</p>
- <p>Unihan.html was formerly the primary documentation file for
- the Unihan Database. As of Version 5.1.0, its
- content has been wholly incorporated into [<a href="../tr41/tr41-21.html#UAX38">UAX38</a>].</p>
- <p>Versions of the Unicode Standard
- prior to Version 4.0.0 contained small, focused
- documentation files, UnicodeCharacterDatabase.html, PropList.html, and
- DerivedProperties.html, which were later consolidated into UCD.html.</p>
- <p>StandardizedVariants.html has been obsoleted as of Version 9.0.0.
- See <i>Section 3.4, <a href="#StandardizedVariants">StandardizedVariants.html</a></i>.</p>
- <h2>4 <a name="UCD_Files" href="#UCD_Files">UCD Files</a></h2>
-
- <p>The heart of the UCD consists of the data files themselves. This section
- describes the directory structure for the UCD, the format conventions
- for the data files, and provides documentation for data files not documented
- elsewhere in this annex.</p>
- <h3>4.1 <a name="Directory_Structure" href="#Directory_Structure">Directory Structure</a></h3>
- <p>Each version of the UCD is released in a separate, numbered directory
- under the <i>Public</i> directory on the Unicode website. The content of that
- directory is complete for that release. It is also stable—once released,
- it will be archived permanently in that directory, unchanged, at a stable URL.</p>
-
- <p>The specific files for the UCD associated with this version of
- the Unicode Standard (10.0.0) are located at:</p>
- <blockquote>
- <a href="http://www.unicode.org/Public/10.0.0/">http://www.unicode.org/Public/10.0.0/</a>
- </blockquote>
- <p>The latest released version of the UCD is always accessible via the
- following stable URL:</p>
- <blockquote>
- <a href="http://www.unicode.org/Public/UCD/latest/">http://www.unicode.org/Public/UCD/latest/</a>
- </blockquote>
- <p>Zipped copies of the latest released version of the UCD are always accessible via the
- following stable URL:</p>
- <blockquote>
- <a href="http://www.unicode.org/Public/zipped/latest/">http://www.unicode.org/Public/zipped/latest/</a>
- </blockquote>
- <p>Prior to Version 6.3.0, access to the latest released version
- of the UCD was via the following stable URL:</p>
- <blockquote>
- <a href="http://www.unicode.org/Public/UNIDATA/">http://www.unicode.org/Public/UNIDATA/</a>
- </blockquote>
- <p>That "UNIDATA" URL will be maintained, but is no longer recommended, because
- it points to the <i>ucd</i> subdirectory of the latest release, rather than to the parent
- directory for the release. The "UNIDATA" naming convention is also very old, and does not follow
- the directory naming conventions currently used for other data releases in the
- <i>Public</i> directory on the Unicode website.</p>
- <h4>4.1.1 <a name="UCD_Proper" href="#UCD_Proper">UCD Files Proper</a></h4>
- <p>The UCD proper is located in the <i>ucd</i> subdirectory of the numbered version
- directory. That directory contains all of the documentation files and most
- of the data files for the UCD, including some data files for derived properties.</p>
-
- <p>Although all UCD data files are version-specific for a release and most contain
- internal date and version stamps, the file names of the released data files do not
- differ from version to version. When linking to a version-specific data file, the
- version will be indicated by the version number of the directory for the release.</p>
-
- <p>All files for derived extracted properties are in the <i>extracted</i>
- subdirectory of the <i>ucd</i> subdirectory.
- See <a href="#Derived_Extracted">Derived Extracted Properties</a> for
- documentation regarding those data files and their content.</p>
- <p>A number of auxiliary properties are specified in files in the <i>auxiliary</i>
- subdirectory of the <i>ucd</i> subdirectory. It contains
- data files specifying properties associated with
- Unicode Standard Annex #29, "Unicode Text Segmentation" [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>]
- and with
- Unicode Standard Annex #14, "Unicode Line Breaking Algorithm" [<a href="../tr41/tr41-21.html#UAX14">UAX14</a>],
- as well as test data for those algorithms.
- See <a href="#Segmentation_Test_Files">Segmentation Test Files and Documentation</a>
- for more information about the test data.</p>
-
- <h4>4.1.2 <a name="UCD_XML_Files" href="#UCD_XML_Files">UCD XML Files</a></h4>
- <p>The XML version of the UCD is located in the <i>ucdxml</i> subdirectory of the
- numbered version directory. See the <a href="#UCD_in_XML">UCD in XML</a> for
- more details.</p>
- <h4>4.1.3 <a name="Chart_Files" href="#Chart_Files">Charts</a></h4>
- <p>The code charts specific to a version of Unicode are archived
- as a single large pdf file in the <i>charts</i> subdirectory of the
- numbered version directory. See the readme.txt in that subdirectory
- and the general web page explaining the
- <a href="http://www.unicode.org/charts/About.html">Unicode Code Charts</a> for
- more details.</p>
-
- <h4>4.1.4 <a name="Beta_Review" href="#Beta_Review">Beta Review Considerations</a></h4>
- <p>Prior to the formal release for any particular version of the UCD, a beta
- review is conducted. The beta review files are located in the same directory
- that is later used for the released UCD, but during the beta review period,
- the subdirectory structure differs somewhat and may contain temporary files,
- including documentation of diffs between deltas for the beta review. Also,
- during the beta review, all data file names are suffixed with version
- numbers and delta numbers. So a typical file name during beta review
- may be "PropList-5.2.0d13.txt" instead of the finally released "PropList.txt".</p>
-
- <p>Notices contained in a ReadMe.txt file in the UCD directory during the
- beta review period also make it clear that that directory contains
- preliminary material under review, rather than a final, stable release.</p>
-
- <h4>4.1.5 <a name="Directory_History" href="#Directory_History">File Directory Differences for Early Releases</a></h4>
- <p>The <a href="#UCD_in_XML">UCD in XML</a> was introduced in Version 5.1.0,
- so UCD directories prior to that do not contain the <i>ucdxml</i> subdirectory.</p>
-
- <p>UCD directories prior to Version 4.1.0 do not contain the <i>auxiliary</i>
- subdirectory.</p>
-
- <p>UCD directories prior to Version 3.2.0 do not contain the <i>extracted</i>
- subdirectory.</p>
-
- <p>The general structure of the file directory for a released version of the UCD
- described above applies to Versions 4.1.0 and later. Prior to Version 4.1.0,
- versions of the UCD were not self-contained, complete sets of data files
- for that version, but instead only contained any new data files or any data files
- which had <i>changed</i> since the prior release.</p>
-
- <p>Because of this, the property files for a given version
- prior to Version 4.1.0 can be spread over several directories. Consult the
- component listings at
- <a href="http://www.unicode.org/versions/enumeratedversions.html">Enumerated Versions</a>
- to find out which files in which directories comprise a complete set of data
- files for that version.</p>
- <p>The directory naming conventions and the file naming conventions also
- differed prior to Version 4.1.0. So, for example, Version 4.0.0 of the UCD
- is contained in a directory named <i>4.0-Update</i>, and Version 4.0.1 of
- the UCD in a directory named <i>4.0-Update1</i>. Furthermore, for these
- earlier versions, the data file names <i>do</i> contain explicit version
- numbers.</p>
-
- <h3>4.2 <a name="Format_Conventions" href="#Format_Conventions">File Format Conventions</a></h3>
- <p>Files in the UCD use the format conventions described in
- this section, unless otherwise specified.</p>
- <h4>4.2.1 <a name="Data_Fields" href="#Data_Fields">Data Fields</a></h4>
- <ul>
- <li>Each line of data consists of fields separated by semicolons. The fields are numbered
- starting with zero.</li>
- <li>The first field (0) of each line in the Unicode Character Database files represents a code
- point or range. The remaining fields (1..n) are properties associated with that code point.</li>
- <li>Leading and trailing spaces within a field are not significant.
- However, no leading or trailing spaces
- are allowed in any field of UnicodeData.txt. For legacy reasons,
- no spaces are allowed before or after the semicolon in LineBreak.txt and in EastAsianWidth.txt.</li>
- <li>The Unihan data files in the UCD have a separate format, using tab characters
- instead of semicolons to separate fields. See [<a href="../tr41/tr41-21.html#UAX38">UAX38</a>]
- for the detailed specification of the format of the Unihan data files. The
- data files TangutSources.txt and NushuSources.txt also use this format.</li>
- </ul>
- <h4>4.2.2 <a name="Code_Points" href="#Code_Points">Code Points and Sequences</a></h4>
- <ul>
- <li>Code points are expressed as hexadecimal numbers with four to six digits.
- They are written without the "U+" prefix in
- all data files except the Unihan data files. The Unihan data files use the "U+" prefix for
- all Unicode code points, to distinguish them from other decimal and hexadecimal
- numerical references occurring in their data fields.</li>
- <li>When a data field contains a sequence of code points, spaces separate
- the code points.
- </li>
- </ul>
- <h4>4.2.3 <a name="Code_Point_Ranges" href="#Code_Point_Ranges">Code Point Ranges</a></h4>
- <ul>
- <li>A range of code points is specified by the form "X..Y".</li>
- <li>Each code point in a range has the
- associated property value specified for that range in the data file. For example (from Blocks.txt):
- <blockquote>
- <pre>
- 0000..007F; Basic Latin
- 0080..00FF; Latin-1 Supplement
- </pre>
- </blockquote>
- </li>
- <li>For backward compatibility, ranges in the file UnicodeData.txt
- are specified by entries for the
- start and end characters of the range, rather than by the form "X..Y".
- The start character is indicated by a range identifier, followed by a comma
- and the string "First", in angle brackets. This entry takes the
- place of a regular character name in field 1 for that line.
- The end character is indicated on the next line with the same range identifier,
- followed by a comma and the string "Last", in angle brackets:
-
- <blockquote>
- <pre>
- 4E00;&lt;CJK Ideograph, First&gt;;Lo;0;L;;;;;N;;;;;
- 9FD5;&lt;CJK Ideograph, Last&gt;;Lo;0;L;;;;;N;;;;;
- </pre>
- </blockquote>
- For character ranges using this convention, the names of all characters in the range
- are algorithmically derivable.
- See <i>Section 4.8, Name</i>
- in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>] for more information on
- derivation of character names for such ranges.</li>
- </ul>
- <h4>4.2.4 <a name="Comments" href="#Comments">Comments</a></h4>
- <ul>
- <li>U+0023 NUMBER SIGN ("#") is used to indicate comments: all
- characters from the number sign to the end
- of the line are considered part of the comment, and are disregarded when parsing data.</li>
- <li>In many files, the comments on data
- lines use a common format, as illustrated here (from Scripts.txt):
- <blockquote>
- <pre>09B2 ; Bengali # Lo BENGALI LETTER LA</pre>
- </blockquote>
- </li>
- <li>The first part of a comment using this common format is the General_Category value,
- provided for information. This is followed by the character name for
- the code point in the first field (0).</li>
- <li>The printing of the General_Category value is suppressed in instances where
- it would be redundant, as for DerivedGeneralCategory.txt, in which the
- property value in the data field is already the General_Category value.</li>
- <li>The symbol "L&amp;"
- indicates characters of General_Category Lu, Ll, or Lt (uppercase, lowercase,
- or titlecase letter). For example:
- <blockquote>
- <pre>0386 ; Greek # L&amp; GREEK CAPITAL LETTER ALPHA WITH TONOS</pre>
- </blockquote>
- L&amp; as used in these comments is an alias for
- the derived LC value (cased letter) for the General_Category property, as documented in
- PropertyValueAliases.txt.</li>
- <li>When the data line contains a range of code points, this common format
- for a comment also indicates a range of character names, separated by "..", as
- illustrated here (from DerivedNumericType.txt):
- <blockquote>
- <pre>00BC..00BE ; Numeric # No [3] VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS</pre>
- </blockquote>
- </li>
- <li>Normally, consecutive characters with the same property value would be
- represented by a single code point range. In data files using this
- comment convention, such ranges are subdivided so that all
- characters in a range also
- have the same General_Category value (or LC).
- While this convention results in more ranges than are strictly necessary, it
- makes the contents of the ranges clearer.</li>
- <li>When a code point range occurs, the number of items in the range is
- included in the comment (in square brackets), immediately following the General_Category value.</li>
- <li>The comments are purely informational, and may change format or be omitted in the
- future. They should not be parsed for content.</li>
- </ul>
-
- <h4>4.2.5 <a name="Code_Point_Labels" href="#Code_Point_Labels">Code Point Labels</a></h4>
- <ul>
- <li>Surrogate code points, private-use characters, control codes, noncharacters,
- and unassigned code points have no names. When such code points are
- listed in the data files, for example to list their General_Category
- values, the comments use code point labels instead of character
- names. For example (from DerivedCoreProperties.txt):
- <blockquote>
- <pre>2065 ; Default_Ignorable_Code_Point # Cn &lt;reserved-2065&gt;</pre>
- </blockquote>
- </li>
- <li>Code point labels use one of the tags as documented in
- <i>Section 4.8, Name</i>
- in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>] and as shown in <i>Table 3</i>,
- followed by "-" and the code point expressed in hexadecimal. The
- entire label is then enclosed in angle brackets.</li>
- </ul>
-
- <p class="caption">Table 3. <a name="Label_Tags_Table" href="#Label_Tags_Table">Code Point Label Tags</a></p>
- <div align="center">
-
- <table class="simple">
- <tr>
- <th>Tag</th>
- <th>General_Category</th>
- <th>Note</th>
- </tr>
- <tr>
- <td>reserved</td>
- <td>Cn</td>
- <td>Noncharacter_Code_Point=F</td>
- </tr>
- <tr>
- <td>noncharacter</td>
- <td>Cn</td>
- <td>Noncharacter_Code_Point=T</td>
- </tr>
- <tr>
- <td>control</td>
- <td>Cc</td>
- <td> </td>
- </tr>
- <tr>
- <td>private-use</td>
- <td>Co</td>
- <td> </td>
- </tr>
- <tr>
- <td>surrogate</td>
- <td>Cs</td>
- <td> </td>
- </tr>
- </table>
- </div>
-
- <p> </p>
-
- <h4>4.2.6 <a name="Multiple_Properties" href="#Multiple_Properties">Multiple Properties in One Data File</a></h4>
- <ul>
- <li>When a file contains the specification for multiple properties, the second field specifies the name
- of the property and the third field specifies the property value. For example (from
- DerivedNormalizationProps.txt):
- <blockquote>
- <pre>
- 03D2 ; FC_NFKC; 03C5 # L&amp; GREEK UPSILON WITH HOOK SYMBOL
- 03D3 ; FC_NFKC; 03CD # L&amp; GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
- </pre>
- </blockquote>
- </li>
- </ul>
- <h4>4.2.7 <a name="Binary_Values" href="#Binary_Values">Binary Property Values</a></h4>
- <ul>
- <li>For binary properties, the second field specifies the name of the applicable property, with
- the implied value of the property being "True". Only the ranges of characters with the binary
- property value of "Y" (= True) are listed. For example (from PropList.txt):
- <blockquote>
- <pre>
- 1680 ; White_Space # Zs OGHAM SPACE MARK
- 2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE
- </pre>
- </blockquote>
- </li>
- </ul>
- <h4>4.2.8 <a name="Multiple_Values" href="#Multiple_Values">Multiple Values for Properties</a></h4>
- <ul>
- <li>When a data file defines a property which may take multiple values for a single code
- point, the multiple values are expressed in a space-delimited list. For example (from ScriptExtensions.txt):
- <blockquote>
- <pre>
- 0640 ; Adlm Arab Mand Mani Phlp Syrc # Lm ARABIC TATWEEL
- </pre>
- </blockquote>
- </li>
- <li>In some cases—but not all—the order of multiple elements in a space-delimited
- list may be significant. When the order of multiple elements is significant, it is documented
- along with the property itself. For example (from Unihan_Readings.txt), for the tag kMandarin,
- when there are two values for a code point, the first value is used to
- indicate a preferred pronunciation for zh-Hans (CN) and the second a
- preferred pronunciation for zh-Hant (TW).
- </li>
- <li>For further discussion, see Section 5.7.6 <a href="#Property_Values_As_Sets">Properties Whose Values Are Sets of Values</a>.</li>
- </ul>
- <h4>4.2.9 <a name="Default_Values" href="#Default_Values">Default Values</a></h4>
- <ul>
- <li>Entries for a code point may be omitted in a data file if the
- code point has a default value for the property in question.</li>
-
- <li>For string properties,
- including the definition of foldings, the
- default value is the code point of the character itself.</li>
-
- <li>For miscellaneous properties which take strings as values,
- such as the Unicode Name property, the default value is a null
- string.</li>
-
- <li>For binary properties, the default value is always "N" (= False)
- and is always omitted.</li>
-
- <li>For enumerated and catalog properties, the default value is listed in a comment. For
- example (from Scripts.txt):
- <blockquote>
- <pre>
- # All code points not explicitly listed for Script
- # have the value Unknown (Zzzz).
- </pre>
- </blockquote>
- </li>
-
- <li>A few properties of the enumerated type have multiple default values. In
- those cases, comments in the file explain the code point ranges for applicable values.
- See also <a href="#Default_Values_Table"><i>Table 4</i></a>.</li>
-
- <li>Default values are also listed in specially-formatted comment lines,
- using the keyword "@missing". Parsers which extract and process
- these lines can algorithmically determine the default values for all code points.
- See <a href="#Missing_Conventions">@missing Conventions</a>
- for details about the syntax and use of these lines.
- </li>
-
- <li>Because of the legacy format constraints for UnicodeData.txt, that
- file contains no specific information about default values for properties.
- The default values for fields in UnicodeData.txt are documented
- in <a href="#Default_Values_Table"><i>Table 4</i></a> below
- if they cannot be derived from the general rules about default values
- for properties.</li>
-
- <li>The file ArabicShaping.txt is also exceptional, because it omits the listing
- of many characters whose property value (jt=T) can be derived by rule. Adding an "@missing" line
- to that file would result in the wrong interpretation of Joining_Type values for omitted characters.
- The full explicit listing of Joining_Type values and the correct "@missing" line for
- the default Joining_Type value (jt=U) can be found in the file DerivedJoiningType.txt instead.</li>
- </ul>
-
- <p>Default values for common catalog, enumeration, and
- numeric properties are listed in <i>Table 4</i>.
- Further explanation is provided below the table, in
- those cases where the default values
- are complex, as indicated in the third column.</p>
-
- <p class="caption">Table 4. <a name="Default_Values_Table" href="#Default_Values_Table">Default Values for Properties</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Property Name</th>
- <th>Default Value(s)</th>
- <th>Complex?</th>
- </tr>
- <tr>
- <td>Age</td>
- <td>Unassigned (= NA)</td>
- <td>No</td>
- </tr>
- <tr>
- <td>Bidi_Class</td>
- <td>L, AL, R, BN, ET</td>
- <td>Yes</td>
- </tr>
- <tr>
- <td>Block</td>
- <td>No_Block</td>
- <td>No</td>
- </tr>
- <tr>
- <td>Canonical_Combining_Class</td>
- <td>Not_Reordered (= 0)</td>
- <td>No</td>
- </tr>
- <tr>
- <td>Decomposition_Type</td>
- <td>None</td>
- <td>No</td>
- </tr>
- <tr>
- <td>East_Asian_Width</td>
- <td>Neutral (= N), Wide (= W)</td>
- <td>Yes</td>
- </tr>
- <tr>
- <td>General_Category</td>
- <td>Cn</td>
- <td>No</td>
- </tr>
- <tr>
- <td>Line_Break</td>
- <td>Unknown (= XX), ID, PR</td>
- <td>Yes</td>
- </tr>
- <tr>
- <td>Numeric_Type</td>
- <td>None</td>
- <td>No</td>
- </tr>
- <tr>
- <td>Numeric_Value</td>
- <td>NaN</td>
- <td>No</td>
- </tr>
- <tr>
- <td>Script</td>
- <td>Unknown (= Zzzz)</td>
- <td>No</td>
- </tr>
- <tr>
- <td>Vertical_Orientation</td>
- <td>Rotated (= R), Upright (= U)</td>
- <td>Yes</td>
- </tr>
- </table>
- </div>
-
- <p><i>Complex default values</i> are those which take multiple values, contingent on
- code point ranges or other conditions. Complex default values other than those specified in the
- "@missing" line are explicitly listed in the relevant property file, except for instances
- noted in this section. This means that a parser extracting property values from
- the UCD should never encounter an ambiguous condition for which the default value of a property
- for a particular code point is unclear.</p>
-
- <p>Default values for the
- <a href="#Bidi_Class">Bidi_Class</a> property are complex. See
- Unicode Standard Annex #9, "Unicode Bidirectional Algorithm" [<a href="../tr41/tr41-21.html#UAX9">UAX9</a>]
- and DerivedBidiClass.txt for full details.</p>
- <p>Default values for the <a href="#East_Asian_Width">East_Asian_Width</a>
- property are complex. This property defaults to Neutral for most code points, but defaults to Wide
- for unassigned code points in blocks associated with CJK ideographs.
- See Unicode Standard Annex #11, "East Asian Width"
- [<a href="../tr41/tr41-21.html#UAX11">UAX11</a>] and
- EastAsianWidth.txt for documentation of the default values
- and DerivedEastAsianWidth.txt for the full listing of values.</p>
-
- <p>Default values for the <a href="#Line_Break">Line_Break</a>
- property are complex. This property defaults to Unknown for most code points, but defaults to ID
- for unassigned code points in blocks associated with CJK ideographs, and
- in blocks in the range U+1F000..U+1FFFD.
- The property defaults to PR for unassigned code
- points in the Currency Symbols block. See Unicode Standard Annex #14, "Unicode Line Breaking Algorithm"
- [<a href="../tr41/tr41-21.html#UAX14">UAX14</a>]
- and LineBreak.txt for documentation of the default values
- and DerivedLineBreak.txt for the full listing of values.</p>
-
- <p>Default values for the <a href="#Vertical_Orientation">Vertical_Orientation</a>
- property are complex. This property defaults to Rotated (R) for most code points,
- but defaults to Upright (U)
- for unassigned code points in blocks associated with scripts that are themselves predominantly Upright.
- See Unicode Standard Annex #50, "Unicode Vertical Text Layout"
- [<a href="../tr41/tr41-21.html#UAX50">UAX50</a>] and VerticalOrientation.txt for full details.</p>
-
- <h4>4.2.10 <a name="Missing_Conventions" href="#Missing_Conventions">@missing Conventions</a></h4>
- <p>Specially-formatted comment lines with the keyword "@missing" are
- used to define default property values for ranges of code points not explicitly listed
- in a data file. These lines follow regular conventions that make them
- machine-readable.</p>
- <p>An @missing line starts with the comment character "#", followed by
- a space, then the "@missing" keyword, followed by a colon, another space, a code
- point range, and a semicolon. Then the
- line typically continues with a semicolon-delimited list of one or more
- default property values. For example:</p>
- <blockquote>
- <pre>
- # @missing: 0000..10FFFF; Unknown
- </pre>
- </blockquote>
- <p>In general, the code point range and semicolon-delimited list follow
- the same syntactic conventions as the data file in which the @missing line occurs, so
- that any parser which interprets that data file can easily be adapted to also
- parse and interpret an @missing line to pick up default property values for code points.</p>
- <p>@missing lines are also supplied for many properties in the file
- PropertyValueAliases.txt. In this case, because there are many @missing lines in that
- single data file, each @missing line contains an additional second field specifying the
- property name for which it defines a default value.</p>
- <p>An @missing line is never provided for a binary property, because the
- default value for binary properties is always "N" and need not be defined redundantly
- for each binary property.</p>
- <p>Because of the
- addition of property names when @missing lines are included in PropertyValueAliases.txt,
- there are currently two syntactic patterns used for @missing lines, as
- summarized schematically below:</p>
- <ol>
- <li>code_point_range; default_prop_val</li>
- <li>code_point_range; property_name; default_prop_val</li>
- </ol>
- <p>In this schematic representation, "default_prop_val" stands in for
- either an explicit property value or for a special tag such as &lt;none&gt; or
- &lt;script&gt;.</p>
- <p>Pattern #1 is used in most primary and derived UCD files. For example:</p>
- <blockquote>
- <pre>
- # @missing: 0000..10FFFF; &lt;none&gt;
- </pre>
- </blockquote>
- <p>Pattern #2 is used in PropertyValueAliases.txt and in
- DerivedNormalizationProps.txt, both of which contain values associated with many
- properties. For example:</p>
- <blockquote>
- <pre>
- # @missing: 0000..10FFFF; NFD_QC; Yes
- </pre>
- </blockquote>
- <p>The special tag values which may occur in the default_prop_val field
- in an @missing line are interpreted as follows:</p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Tag</th>
- <th>Interpretation</th>
- </tr>
- <tr>
- <td>&lt;none&gt;</td>
- <td>the empty string</td>
- </tr>
- <tr>
- <td>&lt;code point&gt;</td>
- <td>the string representation of the code point value</td>
- </tr>
- <tr>
- <td>&lt;script&gt;</td>
- <td>the value equal to the Script property value for this code point</td>
- </tr>
- </table>
- </div>
- <p> </p>
- <h4>4.2.11 <a name="Empty_Fields" href="#Empty_Fields">Empty Fields</a></h4>
- <p>The data file UnicodeData.txt defines many property values in each record. When a
- field in a data line for a code point is empty, that indicates that the property takes
- the default value for that code point. For example:</p>
- <blockquote>
- <pre>
- 0022;QUOTATION MARK;Po;0;ON;;;;;N;;;;;
- </pre>
- </blockquote>
-
- <p>In that data line, the empty numeric fields indicate that the value of Numeric_Value for
- U+0022 is NaN and that the value of Numeric_Type is None. The empty case mapping fields indicate
- that the value of Simple_Uppercase_Mapping for U+0022 takes the default value, namely the
- code point itself, and so forth.</p>
- <p>The interpretation of empty fields in other data files of the UCD differs. In the
- case of data files which define string properties, the omission of an entry for a code point
- indicates that the property takes the default value for that code point. However, if there
- is an entry for a code point, but the property value field for that entry is empty, that
- indicates that the property value is an explicit empty string (""). For example, the derived string
- property <a href="#NFKC_Casefold">NFKC_Casefold</a> may map a code point to a sequence of code points, to a single different code
- point, to the same single code point, or to no code point at all (an empty string). See the following entries from
- the data file DerivedNormalizationProps.txt:</p>
- <blockquote>
- <pre>
- 00AA ; NFKC_CF; 0061 # Lo FEMININE ORDINAL INDICATOR
- 00AD ; NFKC_CF; # Cf SOFT HYPHEN
- 00AF ; NFKC_CF; 0020 0304 # Sk MACRON
- </pre>
- </blockquote>
-
- <p>The empty field for U+00AD indicates that the property NFKC_Casefold maps SOFT HYPHEN
- to an empty string. By contrast, the absence of the entry for U+00AE in the data file indicates
- that the property NFKC_Casefold maps U+00AE REGISTERED SIGN to itself—the default value.</p>
-
- <h4>4.2.12 <a name="Text_Encoding" href="#Text_Encoding">Text Encoding</a></h4>
- <ul>
- <li>The data files use UTF-8. Unless otherwise noted, non-ASCII characters only
- appear in comments.</li>
- <li>The Unihan data files in the UCD make extensive use of UTF-8 in data fields.
- (See [<a href="../tr41/tr41-21.html#UAX38">UAX38</a>] for details.)</li>
- <li>For legacy reasons, NamesList.txt was exceptional; it was encoded
- in Latin-1 prior to Unicode 6.2. For
- Unicode 6.2 and later, the encoding is UTF-8. See <a href="#NamesList">NamesList.html</a>.</li>
- <li>Segmentation test data files, such as WordBreakTest.txt, make
- use of non-ASCII (UTF-8) characters as delimiters for data fields.</li>
- </ul>
-
- <h4>4.2.13 <a name="Line_Termination" href="#Line_Termination">Line Termination</a></h4>
- <ul>
- <li>All data files in the UCD use LF line termination (not CRLF line termination).
- When copied to different systems, these line endings may be automatically changed to
- use the native line termination conventions for that system. Make sure your editor (or parser) can
- deal with the line termination
- style in the local copy of the data files.</li>
- </ul>
-
- <h4>4.2.14 <a name="Other_Conventions" href="#Other_Conventions">Other Conventions</a></h4>
- <ul>
- <li>In some test data files, segments of the test data are distinguished by a line
- starting with an "@" sign. For example (from NormalizationTest.txt):
- <blockquote>
- <pre>
- @Part1 # Character by character test
- </pre>
- </blockquote>
- </li>
- </ul>
- <h4>4.2.15 <a name="Other_File_Formats" href="#Other_File_Formats">Other File Formats</a></h4>
- <ul>
- <li>The data format for Unihan data files and for
- TangutSources.txt and NushuSources.txt
- in the UCD differs from the standard format.
- See the discussion of <a href="#Unihan">Unihan and UAX #38</a>
- earlier in this annex for more information.</li>
- <li>The format for NamesList.txt, which documents the Unicode names
- list and which is used programmatically to drive the formatting
- program for Unicode code charts, also differs significantly from regular UCD data files.
- See <a href="#NamesList">NamesList.html</a>.</li>
- <li>Index.txt is another exception. It uses a tab-delimited format, with field 0
- consisting of an index entry string, and field 1 a code point. Index.txt is used to
- maintain the <a href="http://www.unicode.org/charts/charindex.html">
- Unicode Character Name Index</a>.</li>
- <li>The various segmentation test data files make use of "#" to delimit comments,
- but have distinct conventions for their data fields. See the documentation
- in their header sections for details of the data field formats for
- those files.</li>
- <li>The XML version of the UCD has its own file format conventions.
- In those files, "#" is used to stand for the code point in
- algorithmically derivable character names such as CJK UNIFIED IDEOGRAPH-4E00
- or TANGUT IDEOGRAPH-17000,
- so as to allow for name sharing in more compact representations of the data.
- See Unicode Standard Annex #42, "Unicode Character Database in XML"
- [<a href="../tr41/tr41-21.html#UAX42">UAX42</a>] for details.</li>
- </ul>
- <h3>4.3 <a name="File_List" href="#File_List">File List</a></h3>
- <p>The exact list of files associated with any particular version of the UCD is
- available on the Unicode website by referring to the component listings at
- <a href="http://www.unicode.org/versions/enumeratedversions.html">Enumerated Versions</a>.</p>
-
- <p>The majority of the data files in the UCD provide specifications of
- character properties for Unicode characters. Those files and their contents
- are documented in detail in the <a href="#Property_Definitions">Property Definitions</a> section
- below.</p>
-
- <p>The data files in the <i>extracted</i> subdirectory constitute reformatted listings
- of single character properties extracted from UnicodeData.txt or other primary
- data files. The reformatting is provided to make it easier to see the particular set
- of characters having certain values for enumerated properties, or to separate
- the statement of that property from other properties defined together
- in UnicodeData.txt. These files also include explicit
- listings of default values for the respective properties. These extracted, derived data files are further documented in
- the <a href="#Derived_Extracted">Derived Extracted Properties</a> section below.</p>
-
- <p>The UCD also contains a number of test data files, whose purpose is to provide
- standard test cases useful in verifying the implementation of complex Unicode
- algorithms. See the <a href="#Test_Files">Test Files</a> section below for more
- documentation.</p>
- <p>The remaining files in the Unicode Character Database do not directly specify Unicode
- properties. The important ones and their functions are listed in <i>Table 5</i>.
- The Status column indicates whether the file (and its content) is considered
- <b>N</b>ormative, <b>I</b>nformative, or <b>P</b>rovisional.</p>
-
- <p class="caption">Table 5. <a name="UCD_Files_Table" href="#UCD_Files_Table">Files in the UCD</a></p>
- <table class="simple">
- <tr>
- <th>File Name</th>
- <th>Reference</th>
- <th>Status</th>
- <th>Description</th>
- </tr>
- <tr>
- <td>CJKRadicals.txt</td>
- <td>[<a href="../tr41/tr41-21.html#UAX38">UAX38</a>]</td>
- <td style="text-align:center">I</td>
- <td>List of Unified CJK Ideographs and CJK Radicals that correspond to
- specific radical numbers used in the CJK radical stroke counts.</td>
- </tr>
- <tr>
- <td>USourceData.txt</td>
- <td>[<a href="../tr41/tr41-21.html#UAX45">UAX45</a>]</td>
- <td style="text-align:center">N</td>
- <td>The list of formal references for UTC-Source ideographs, together with data regarding
- their status and sources.</td>
- </tr>
- <tr>
- <td>USourceGlyphs.pdf</td>
- <td>[<a href="../tr41/tr41-21.html#UAX45">UAX45</a>]</td>
- <td style="text-align:center">I</td>
- <td>A table containing a representative glyph for each UTC-Source ideograph.</td>
- </tr>
- <tr>
- <td>TangutSources.txt</td>
- <td>Chapter 18</td>
- <td style="text-align:center">N</td>
- <td>Specifies normative source mappings for
- Tangut ideographs and components. This data
- file also includes informative radical-stroke values that are used in
- the preparation of the code charts for the Tangut blocks.<br>
- <b>kTGT_MergedSrc</b>: normative source mapping to various Tangut source references<br>
- <b>kRSTUnicode</b>: informative radical-stroke value</td>
- </tr>
- <tr>
- <td>NushuSources.txt</td>
- <td>Chapter 18</td>
- <td style="text-align:center">N</td>
- <td>Specifies normative source mappings for Nushu ideographs. This data
- file also includes informative readings for Nushu characters.<br>
- <b>kSrc_NushuDuben</b>: normative source mapping to the Nushu Duben<br>
- <b>kReading</b>: informative example phonetic reading</td>
- </tr>
- <tr>
- <td>EmojiSources.txt</td>
- <td>Chapter 22</td>
- <td style="text-align:center">N</td>
- <td>Specifies source mappings to SJIS values for emoji symbols in the original implementations
- of these symbols by Japanese telecommunications companies.</td>
- </tr>
- <tr>
- <td>Index.txt</td>
- <td>Chapter 24</td>
- <td style="text-align:center">I</td>
- <td>Index to Unicode characters.</td>
- </tr>
- <tr>
- <td>NamesList.txt</td>
- <td>Chapter 24</td>
- <td style="text-align:center">I</td>
- <td>Names list used for production of the code charts, derived from UnicodeData.txt.
- It contains additional annotations.</td>
- </tr>
- <tr>
- <td><a href="#NamesList">NamesList.html</a></td>
- <td>Chapter 24</td>
- <td style="text-align:center">I</td>
- <td>Documents the format of NamesList.txt. </td>
- </tr>
- <tr>
- <td>StandardizedVariants.txt</td>
- <td>Chapter 23</td>
- <td style="text-align:center">N</td>
- <td>Lists all the standardized variant sequences that have been defined, plus a textual description of
- their desired appearance.</td>
- </tr>
- <tr>
- <td><a href="#StandardizedVariants">StandardizedVariants.html</a></td>
- <td>Chapter 23</td>
- <td style="text-align:center">N</td>
- <td>An obsolete derived documentation file.</td>
- </tr>
- <tr>
- <td>NamedSequences.txt</td>
- <td>[<a href="../tr41/tr41-21.html#UAX34">UAX34</a>]</td>
- <td style="text-align:center">N</td>
- <td>Lists the names for all approved named sequences.</td>
- </tr>
- <tr>
- <td>NamedSequencesProv.txt</td>
- <td>[<a href="../tr41/tr41-21.html#UAX34">UAX34</a>]</td>
- <td style="text-align:center">P</td>
- <td>Lists the names for all provisional named sequences.</td>
- </tr>
- </table>
-
- <p>For more information about these files and their use, see the referenced annexes or
- chapters of the Unicode Standard.</p>
-
- <h3>4.4 <a name="Zipped_Files" href="#Zipped_Files">Zipped Files</a></h3>
- <p>Starting with Version 4.1.0, zipped versions of all of the UCD files,
- both data files and documentation files, are available under the <i>Public/zipped</i>
- directory on the Unicode website. Each collection of zipped files is located
- there in a numbered subdirectory corresponding to that version of the UCD.</p>
-
- <p>Two different zipped files are provided for each version:</p>
- <ul>
- <li><b>Unihan.zip</b> is the zipped version of the very large Unihan data
- files.</li>
- <li><b>UCD.zip</b> is the zipped
- version of all of the rest of the UCD data files, excluding
- the Unihan data files.</li>
- </ul>
-
- <p>This bifurcation allows for better management of downloading version-specific
- information, because Unihan.zip contains all the pertinent CJK-related
- property information, while UCD.zip contains all of the rest of the UCD
- property information, for those who may not need the voluminous CJK data.</p>
- <p>Starting with Version 6.1.0 the main versioned directories for the UCD also contain a copy
- of UCD.zip, for convenience in access.</p>
-
- <p>In versions of the UCD prior to Version 4.1.0, zipped copies of the
- Unihan data files (which for those versions were released as a single large text file, Unihan.txt)
- are provided in the same directory as the UCD data files. These zipped files are only posted
- for versions of the UCD in which Unihan.txt was updated.</p>
- <h3>4.5 <a name="UCD_in_XML" href="#UCD_in_XML">UCD in XML</a></h3>
- <p>Starting with Version 5.1.0, a set of XML data
- files is also released with each version of the UCD. Those
- data files make it possible to import and process the UCD property data using
- standard XML parsing tools, instead of the specialized parsing required for the
- various individual data files of the UCD.</p>
- <h4>4.5.1 <a name="UAX42_doc" href="#UAX42_doc">UAX #42</a></h4>
- <p>Unicode Standard Annex #42, "Unicode Character Database in XML" [<a href="../tr41/tr41-21.html#UAX42">UAX42</a>]
- defines an XML schema
- which is used to incorporate all of the Unicode character property information
- into the XML version of the UCD. See that annex for details of the
- schema and conventions regarding the grouping of property values for
- more compact representations.</p>
- <h4>4.5.2 <a name="XML_files" href="#XML_files">XML File List</a></h4>
- <p>The XML version of the UCD is contained in the <i>ucdxml</i> subdirectory
- of the UCD. The files are all zipped. The list of files is shown in
- <i>Table 6</i>.</p>
- <p class="caption">Table 6. <a name="XML_Files_Table" href="#XML_Files_Table">XML File List</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>File Name</th>
- <th>CJK</th>
- <th>non-CJK</th>
- </tr>
- <tr>
- <td>ucd.all.flat.zip</td>
- <td style="text-align:center">x</td>
- <td style="text-align:center">x</td>
- </tr>
- <tr>
- <td>ucd.all.grouped.zip</td>
- <td style="text-align:center">x</td>
- <td style="text-align:center">x</td>
- </tr>
- <tr>
- <td>ucd.nounihan.flat.zip</td>
- <td> </td>
- <td style="text-align:center">x</td>
- </tr>
- <tr>
- <td>ucd.nounihan.grouped.zip</td>
- <td> </td>
- <td style="text-align:center">x</td>
- </tr>
- <tr>
- <td>ucd.unihan.flat.zip</td>
- <td style="text-align:center">x</td>
- <td> </td>
- </tr>
- <tr>
- <td>ucd.unihan.grouped.zip</td>
- <td style="text-align:center">x</td>
- <td> </td>
- </tr>
- </table>
- </div>
-
- <p>The "flat" file versions simply list all attributes with no
- particular compression. The "grouped" file versions apply the
- grouping mechanism described in [<a href="../tr41/tr41-21.html#UAX42">UAX42</a>]
- to cut down on the size of the data files.</p>
- <h2>5 <a name="Properties" href="#Properties">Properties</a></h2>
- <p>This section documents the Unicode character properties, relating them
- in detail to the particular UCD data files in which they are specified.
- For enumerated properties in particular, this section also documents the
- actual values which those properties can have.</p>
-
- <h3>5.1 <a name="Property_Index" href="#Property_Index">Property Index</a></h3>
- <p><i>Table 7</i> provides a summary list of the Unicode character properties,
- excluding most of those specific to the Unihan
- data files. For a comparable
- index of CJK character properties, see Unicode Standard Annex #38, "Unicode Han Database (Unihan)"
- [<a href="../tr41/tr41-21.html#UAX38">UAX38</a>].</p>
-
- <p>The properties are roughly organized into groups
- based on their usage. This grouping is primarily for documentation convenience and,
- except for <a href="#Contributory_Properties">contributory properties</a>, has no
- normative implications. Contributory properties are
- shown in this index with a <span class="lightgray">gray background</span>, to better distinguish them visually from
- ordinary (simple or derived) properties.
- Deprecated properties and other properties
- not recommended for support in public <a href="#Property_APIs">property APIs</a> are also shown
- with a <span class="lightgray">gray background</span>.
- The link on each property leads to its
- description in
- <i>Table 9, <a href="#Property_List_Table">Property Table</a></i>.
- Any property marked as
- <a href="#Deprecated_Properties">deprecated</a> in this index is
- also automatically considered <a href="#Obsolete_Properties">obsolete</a>.</p>
-
- <p class="caption">Table 7. <a name="Property_Index_Table" href="#Property_Index_Table">Property Index by Scope of Use</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th width="33%">General</th>
- <th width="33%">Normalization</th>
- <th width="33%">CJK</th>
- </tr>
- <tr>
- <td><a href="#Name">Name</a></td>
- <td><a href="#Canonical_Combining_Class">Canonical_Combining_Class</a></td>
- <td><a href="#Ideographic">Ideographic</a></td>
- </tr>
- <tr>
- <td><a href="#Name_Alias">Name_Alias</a></td>
- <td class="lightgray"><a href="#Decomposition_Mapping">Decomposition_Mapping</a></td>
- <td><a href="#Unified_Ideograph">Unified_Ideograph</a></td>
- </tr>
- <tr>
- <td><a href="#Block">Block</a></td>
- <td class="lightgray"><a href="#Composition_Exclusion">Composition_Exclusion</a></td>
- <td><a href="#Radical">Radical</a></td>
- </tr>
- <tr>
- <td><a href="#Age">Age</a></td>
- <td class="lightgray"><a href="#Full_Composition_Exclusion">Full_Composition_Exclusion</a></td>
- <td><a href="#IDS_Binary_Operator">IDS_Binary_Operator</a></td>
- </tr>
- <tr>
- <td><a href="#General_Category">General_Category</a></td>
- <td><a href="#Decomposition_Type">Decomposition_Type</a></td>
- <td><a href="#IDS_Trinary_Operator">IDS_Trinary_Operator</a></td>
- </tr>
- <tr>
- <td><a href="#Script">Script</a></td>
- <td class="lightgray"><a href="#FC_NFKC_Closure">FC_NFKC_Closure</a> (deprecated)</td>
- <td><a href="#Unicode_Radical_Stroke">Unicode_Radical_Stroke</a></td>
- </tr>
- <tr>
- <td><a href="#Script_Extensions">Script_Extensions</a></td>
- <td> </td>
- <td> </td>
- </tr>
- <tr>
- <td><a href="#White_Space">White_Space</a></td>
- <td><a href="#NFC_Quick_Check">NFC_Quick_Check</a></td>
- <th>Miscellaneous</th>
- </tr>
- <tr>
- <td><a href="#Alphabetic">Alphabetic</a></td>
- <td><a href="#NFKC_Quick_Check">NFKC_Quick_Check</a></td>
- <td><a href="#Math">Math</a></td>
- </tr>
- <tr>
- <td><a href="#Hangul_Syllable_Type">Hangul_Syllable_Type</a></td>
- <td><a href="#NFD_Quick_Check">NFD_Quick_Check</a></td>
- <td><a href="#Quotation_Mark">Quotation_Mark</a></td>
- </tr>
- <tr>
- <td><a href="#Noncharacter_Code_Point">Noncharacter_Code_Point</a></td>
- <td><a href="#NFKD_Quick_Check">NFKD_Quick_Check</a></td>
- <td><a href="#Dash">Dash</a></td>
- </tr>
- <tr>
- <td><a href="#Default_Ignorable_Code_Point">Default_Ignorable_Code_Point</a></td>
- <td class="lightgray"><a href="#Expands_On_NFC">Expands_On_NFC</a> (deprecated)</td>
- <td class="lightgray"><a href="#Hyphen">Hyphen</a> (deprecated, stabilized)</td>
- </tr>
- <tr>
- <td><a href="#Deprecated">Deprecated</a></td>
- <td class="lightgray"><a href="#Expands_On_NFD">Expands_On_NFD</a> (deprecated)</td>
- <td><a href="#STerm">Sentence_Terminal</a></td>
- </tr>
- <tr>
- <td><a href="#Logical_Order_Exception">Logical_Order_Exception</a></td>
- <td class="lightgray"><a href="#Expands_On_NFKC">Expands_On_NFKC</a> (deprecated)</td>
- <td><a href="#Terminal_Punctuation">Terminal_Punctuation</a></td>
- </tr>
- <tr>
- <td><a href="#Variation_Selector">Variation_Selector</a></td>
- <td class="lightgray"><a href="#Expands_On_NFKD">Expands_On_NFKD</a> (deprecated)</td>
- <td><a href="#Diacritic">Diacritic</a></td>
- </tr>
- <tr>
- <th>Case</th>
- <td><a href="#NFKC_Casefold">NFKC_Casefold</a></td>
- <td><a href="#Extender">Extender</a></td>
- </tr>
- <tr>
- <td><a href="#Uppercase">Uppercase</a></td>
- <td><a href="#CWKCF">Changes_When_NFKC_Casefolded</a></td>
- <td><a href="#Grapheme_Base">Grapheme_Base</a></td>
- </tr>
- <tr>
- <td><a href="#Lowercase">Lowercase</a></td>
- <th>Shaping and Rendering</th>
- <td><a href="#Grapheme_Extend">Grapheme_Extend</a></td>
- </tr>
- <tr>
- <td><a href="#Lowercase_Mapping">Lowercase_Mapping</a></td>
- <td><a href="#Join_Control">Join_Control</a></td>
- <td class="lightgray"><a href="#Grapheme_Link">Grapheme_Link</a> (deprecated)</td>
- </tr>
- <tr>
- <td><a href="#Titlecase_Mapping">Titlecase_Mapping</a></td>
- <td><a href="#Joining_Group">Joining_Group</a></td>
- <td><a href="#Unicode_1_Name">Unicode_1_Name</a></td>
- </tr>
- <tr>
- <td><a href="#Uppercase_Mapping">Uppercase_Mapping</a></td>
- <td><a href="#Joining_Type">Joining_Type</a></td>
- <td class="lightgray"><a href="#ISO_Comment">ISO_Comment</a> (deprecated, stabilized)</td>
- </tr>
- <tr>
- <td> </td>
- <td><a href="#Vertical_Orientation">Vertical_Orientation</a></td>
- <td><a href="#Regional_Indicator">Regional_Indicator</a></td>
- </tr>
- <tr>
- <td><a href="#Case_Folding">Case_Folding</a></td>
- <td><a href="#Line_Break">Line_Break</a></td>
- <td><a href="#Indic_Positional_Category">Indic_Positional_Category</a></td>
- </tr>
- <tr>
- <td><a href="#Simple_Lowercase_Mapping">Simple_Lowercase_Mapping</a></td>
- <td><a href="#Grapheme_Cluster_Break">Grapheme_Cluster_Break</a></td>
- <td><a href="#Indic_Syllabic_Category">Indic_Syllabic_Category</a></td>
- </tr>
- <tr>
- <td><a href="#Simple_Titlecase_Mapping">Simple_Titlecase_Mapping</a></td>
- <td><a href="#Sentence_Break">Sentence_Break</a></td>
- <th>Contributory Properties</th>
- </tr>
- <tr>
- <td><a href="#Simple_Uppercase_Mapping">Simple_Uppercase_Mapping</a></td>
- <td><a href="#Word_Break">Word_Break</a></td>
- <td class="lightgray"><a href="#Other_Alphabetic">Other_Alphabetic</a></td>
- </tr>
- <tr>
- <td><a href="#Simple_Case_Folding">Simple_Case_Folding</a></td>
- <td><a href="#East_Asian_Width">East_Asian_Width</a></td>
- <td class="lightgray"><a href="#Other_Default_Ignorable_Code_Point">Other_Default_Ignorable_Code_Point</a></td>
- </tr>
- <tr>
- <td><a href="#Soft_Dotted">Soft_Dotted</a></td>
- <td><a href="#Prepended_Concatenation_Mark">Prepended_Concatenation_Mark</a></td>
- <td class="lightgray"><a href="#Other_Grapheme_Extend">Other_Grapheme_Extend</a></td>
- </tr>
- <tr>
- <td><a href="#Cased">Cased</a></td>
- <th>Bidirectional</th>
- <td class="lightgray"><a href="#Other_ID_Start">Other_ID_Start</a></td>
- </tr>
- <tr>
- <td><a href="#Case_Ignorable">Case_Ignorable</a></td>
- <td><a href="#Bidi_Class">Bidi_Class</a></td>
- <td class="lightgray"><a href="#Other_ID_Continue">Other_ID_Continue</a></td>
- </tr>
- <tr>
- <td><a href="#CWL">Changes_When_Lowercased</a></td>
- <td><a href="#Bidi_Control">Bidi_Control</a></td>
- <td class="lightgray"><a href="#Other_Lowercase">Other_Lowercase</a></td>
- </tr>
- <tr>
- <td><a href="#CWU">Changes_When_Uppercased</a></td>
- <td><a href="#Bidi_Mirrored">Bidi_Mirrored</a></td>
- <td class="lightgray"><a href="#Other_Math">Other_Math</a></td>
- </tr>
- <tr>
- <td><a href="#CWT">Changes_When_Titlecased</a></td>
- <td><a href="#Bidi_Mirroring_Glyph">Bidi_Mirroring_Glyph</a></td>
- <td class="lightgray"><a href="#Other_Uppercase">Other_Uppercase</a></td>
- </tr>
- <tr>
- <td><a href="#CWCF">Changes_When_Casefolded</a></td>
- <td><a href="#Bidi_Paired_Bracket">Bidi_Paired_Bracket</a></td>
- <td class="lightgray"><a href="#Jamo_Short_Name">Jamo_Short_Name</a></td>
- </tr>
- <tr>
- <td><a href="#CWCM">Changes_When_Casemapped</a></td>
- <td><a href="#Bidi_Paired_Bracket_Type">Bidi_Paired_Bracket_Type</a></td>
- <td> </td>
- </tr>
- <tr>
- <th>Numeric</th>
- <th>Identifiers</th>
- <td> </td>
- </tr>
- <tr>
- <td><a href="#Numeric_Value">Numeric_Value</a></td>
- <td><a href="#ID_Continue">ID_Continue</a></td>
- <td> </td>
- </tr>
- <tr>
- <td><a href="#Numeric_Type">Numeric_Type</a></td>
- <td><a href="#ID_Start">ID_Start</a></td>
- <td> </td>
- </tr>
- <tr>
- <td><a href="#Hex_Digit">Hex_Digit</a></td>
- <td><a href="#XID_Continue">XID_Continue</a></td>
- <td> </td>
- </tr>
- <tr>
- <td><a href="#ASCII_Hex_Digit">ASCII_Hex_Digit</a></td>
- <td><a href="#XID_Start">XID_Start</a></td>
- <td> </td>
- </tr>
- <tr>
- <td> </td>
- <td><a href="#Pattern_Syntax">Pattern_Syntax</a></td>
- <td> </td>
- </tr>
- <tr>
- <td> </td>
- <td><a href="#Pattern_White_Space">Pattern_White_Space</a></td>
- <td> </td>
- </tr>
- </table>
- </div>
- <p> </p>
- <h3>5.2 <a name="About_Property_Table" href="#About_Property_Table">About the Property Table</a></h3>
- <p><i>Table 9, <a href="#Property_List_Table">Property Table</a></i>
- specifies the list of character properties
- defined in the UCD.
- That table is divided into separate sections for each data
- file in the UCD. Data files which define a single property or a small number of properties are listed
- first, followed by the data files which define a
- large number of properties: <a href="#DerivedCoreProperties.txt">DerivedCoreProperties.txt</a>,
- <a href="#DerivedNormalizationProps.txt">DerivedNormalizationProps.txt</a>,
- <a href="#PropList.txt">PropList.txt</a>, and <a href="#UnicodeData.txt">UnicodeData.txt</a>.
- In some instances for these files defining many properties, the
- entries in the property table are grouped by type, for clarity in presentation, rather than
- being listed alphabetically.</p>
-
- <p>In <i>Table 9,
- <a href="#Property_List_Table">Property Table</a></i> each property is described as follows:</p>
-
- <p><b>First Column.</b> This column contains the name of each of the character properties
- specified in the respective data file.
- Any special status for a property, such
- as whether it is <a href="#Obsolete_Properties">obsolete</a>,
- <a href="#Deprecated_Properties">deprecated</a>, or
- <a href="#Stabilized_Properties">stabilized</a>, is also indicated in
- the first column.</p>
- <p><b>Second Column.</b> This column
- indicates the type of the property, according to the
- key in <i>Table 8</i>.</p>
-
- <p class="caption">Table 8. <a name="Type_Key_Table" href="#Type_Key_Table">Property Type Key</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Property Type</th>
- <th>Symbol</th>
- <th>Examples</th>
- </tr>
- <tr>
- <td>Catalog</td>
- <td style="text-align:center">C</td>
- <td>Age, Block</td>
- </tr>
- <tr>
- <td>Enumeration</td>
- <td style="text-align:center">E</td>
- <td>Joining_Type, Line_Break</td>
- </tr>
- <tr>
- <td>Binary</td>
- <td style="text-align:center">B</td>
- <td>Uppercase, White_Space</td>
- </tr>
- <tr>
- <td>String</td>
- <td style="text-align:center">S</td>
- <td>Uppercase_Mapping, Case_Folding</td>
- </tr>
- <tr>
- <td>Numeric</td>
- <td style="text-align:center">N</td>
- <td>Numeric_Value</td>
- </tr>
- <tr>
- <td>Miscellaneous</td>
- <td style="text-align:center">M</td>
- <td>Name, Jamo_Short_Name</td>
- </tr>
- </table>
- </div>
- <ul>
- <li><b>Catalog</b> properties have enumerated values which are expected
- to be regularly extended in successive versions of the Unicode Standard. This distinguishes them
- from Enumeration properties.</li>
- <li><b>Enumeration</b> properties have enumerated values
- which constitute a logical partition space;
- new values will generally not be added to them in successive versions of the standard.</li>
- <li><b>Binary</b> properties are a special case of Enumeration properties, which
- have exactly two values: Yes and No (or True and False).</li>
- <li><b>String</b> properties
- are typically mappings from a Unicode code point to another Unicode code point
- or sequence of Unicode code points; examples include case mappings and
- decomposition mappings.</li>
- <li><b>Numeric</b> properties specify the actual numeric values
- for digits and other characters associated with numbers in some way.</li>
- <li><b>Miscellaneous</b> properties are those properties that do not fit neatly into the other
- property categories; they currently include character names, comments about characters,
- the <a href="#Script_Extensions">Script_Extensions</a> property,
- and the Unicode_Radical_Stroke property (a combination of numeric values)
- documented in Unicode Standard Annex #38, "Unicode Han Database (Unihan)"
- [<a href="../tr41/tr41-21.html#UAX38">UAX38</a>].</li>
- </ul>
-
- <p><b>Third Column.</b> This column indicates the
- status of the property: <b>N</b>ormative or <b>I</b>nformative or <b>C</b>ontributory
- or <b>P</b>rovisional.</p>
-
- <p><b>Fourth Column.</b> This column provides a description of
- the property or properties. This includes information on derivation for
- derived properties, as well as references to locations in the standard
- where the property is defined or discussed in detail.</p>
-
- <p>In the section of the table for <a href="#UnicodeData.txt">UnicodeData.txt</a>,
- the data field numbers are also supplied in parentheses at the
- start of the description.</p>
-
- <p>For a few entries in the property table, values specified in the fields in a
- data file only contribute to a full definition of a Unicode character property.
- For example, the values in field 1 (Name) in
- UnicodeData.txt do not provide all the values for the Name
- property for all code points; <a href="#Jamo.txt">Jamo.txt</a> must also be used,
- and the Name property for CJK unified ideographs, Tangut ideographs,
- and Nushu ideographs is derived by rule.</p>
-
- <p>None of the Unicode character properties should be used simply on the
- basis of the descriptions in the property table without consulting the relevant
- discussions in the Unicode Standard. Because of the enormous variety of
- characters in the repertoire of the Unicode Standard, character properties
- tend not to be self-evident in application, even when the names of the
- properties may seem familiar from their usage with much smaller legacy
- character encodings.</p>
- <h3>5.3 <a name="Property_Definitions" href="#Property_Definitions">Property Definitions</a></h3>
- <p>This section contains the table which describes each character property and defines its status, organized by data file in the UCD.
- <i>Table 9</i> provides general descriptions of the Unicode character properties, their derivations,
- and/or their usage, as well as pointers to the respective parts of the standard where formal property definitions or additional
- information about the properties can be found. The property status column and any formal statement of the derivation
- of derived properties are definitive; however, <i>Table 9</i> does not provide formal definitions of the other properties
- and should not be interpreted as such. For details on the columns and overall organization of the table, see
- Section 5.2 <a href="#About_Property_Table">About the Property Table</a>.</p>
- <p class="caption">Table 9. <a name="Property_List_Table" href="#Property_List_Table">Property Table</a></p>
- <table class="simple">
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="ArabicShaping.txt" href="#ArabicShaping.txt">ArabicShaping.txt</a></th>
- </tr>
- <tr>
- <td><a name="Joining_Type" href="#Joining_Type">Joining_Type</a><br>
- <a name="Joining_Group" href="#Joining_Group">Joining_Group</a></td>
- <td>E</td>
- <td valign="top">N</td>
- <td>Basic Arabic and Syriac character shaping properties, such as initial, medial and final
- shapes. See <i>Section 9.2, Arabic</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- </td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="BidiBrackets.txt" href="#BidiBrackets.txt">BidiBrackets.txt</a></th>
- </tr>
- <tr>
- <td><a name="Bidi_Paired_Bracket_Type" href="#Bidi_Paired_Bracket_Type">Bidi_Paired_Bracket_Type</a></td>
- <td>E</td>
- <td valign="top">N</td>
- <td>Type of a paired bracket, either opening or closing. This property is used in the implementation
- of parenthesis matching.
- See Unicode Standard Annex #9, "Unicode Bidirectional Algorithm" [<a href="../tr41/tr41-21.html#UAX9">UAX9</a>].</td>
- </tr>
- <tr>
- <td><a name="Bidi_Paired_Bracket" href="#Bidi_Paired_Bracket">Bidi_Paired_Bracket</a></td>
- <td>M</td>
- <td valign="top">N</td>
- <td>For an opening bracket, the code point of the matching closing bracket. For a closing bracket, the
- code point of the matching opening bracket. This property is used in the implementation
- of parenthesis matching.
- See Unicode Standard Annex #9, "Unicode Bidirectional Algorithm" [<a href="../tr41/tr41-21.html#UAX9">UAX9</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="BidiMirroring.txt" href="#BidiMirroring.txt">BidiMirroring.txt</a></th>
- </tr>
- <tr>
- <td><a name="Bidi_Mirroring_Glyph" href="#Bidi_Mirroring_Glyph">Bidi_Mirroring_Glyph</a></td>
- <td>M</td>
- <td valign="top">I</td>
- <td>Informative mapping for substituting characters in an implementation of bidirectional mirroring.
- This maps a subset of characters with the Bidi_Mirrored property to other
- characters that normally are displayed with the corresponding mirrored glyph.
- When a character with the Bidi_Mirrored property has
- the default value for Bidi_Mirroring_Glyph, that means that no other character
- exists whose glyph is appropriate for character-based glyph mirroring.
- Implementations must then use other mechanisms to implement mirroring of those
- characters for the Unicode Bidirectional Algorithm.
- See Unicode Standard Annex #9, "Unicode Bidirectional Algorithm" [<a href="../tr41/tr41-21.html#UAX9">UAX9</a>]. Do not
- confuse this property with the <a href="#Bidi_Mirrored">Bidi_Mirrored</a> property itself.</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="Blocks.txt" href="#Blocks.txt">Blocks.txt</a></th>
- </tr>
- <tr>
- <td><a name="Block" href="#Block">Block</a></td>
- <td>C</td>
- <td valign="top">N</td>
- <td>Blocks.txt specifies the Block property, which consists
- of the list of block names
- for ranges of code points. See
- D10b in <i>Section 3.4, Characters and Encoding</i>, of
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>]. See also
- the code charts in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="CompositionExclusions.txt" href="#CompositionExclusions.txt">CompositionExclusions.txt</a></th>
- </tr>
- <tr>
- <td><a name="Composition_Exclusion" href="#Composition_Exclusion">Composition_Exclusion</a></td>
- <td>B</td>
- <td valign="top">N</td>
- <td>
- A property used in normalization. See Unicode Standard Annex #15, "Unicode Normalization Forms" [<a href="../tr41/tr41-21.html#UAX15">UAX15</a>].
- Unlike other files, CompositionExclusions.txt simply lists the relevant code points.</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="CaseFolding.txt" href="#CaseFolding.txt">CaseFolding.txt</a></th>
- </tr>
- <tr>
- <td><a name="Simple_Case_Folding" href="#Simple_Case_Folding">Simple_Case_Folding</a><br>
- <a name="Case_Folding" href="#Case_Folding">Case_Folding</a></td>
- <td>S</td>
- <td valign="top">N</td>
- <td>Mapping from characters to their case-folded forms. This is an informative file containing
- normative derived properties.
- <p><i>Derived from UnicodeData and SpecialCasing.</i>
- <p><b>Note: </b>The case foldings are omitted in the data file if they are
- the same as the code point itself.</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="DerivedAge.txt" href="#DerivedAge.txt">DerivedAge.txt</a></th>
- </tr>
- <tr>
- <td><a name="Age" href="#Age">Age</a></td>
- <td>C</td>
- <td valign="top">N</td>
- <td>A property defining when various code points were designated/assigned in successive versions
- of the Unicode Standard.
- For a detailed discussion of the Age property, see
- Section 5.14, <a href="#Character_Age"><i>Character Age</i></a>.
- </td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="EastAsianWidth.txt" href="#EastAsianWidth.txt">EastAsianWidth.txt</a></th>
- </tr>
- <tr>
- <td><a name="East_Asian_Width" href="#East_Asian_Width">East_Asian_Width</a></td>
- <td>E</td>
- <td valign="top">I</td>
- <td>A property
- for determining the choice of wide versus narrow glyphs in East Asian contexts.
- Property values are described in Unicode Standard Annex #11, "East Asian Width" [<a href="../tr41/tr41-21.html#UAX11">UAX11</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="HangulSyllableType.txt" href="#HangulSyllableType.txt">HangulSyllableType.txt</a></th>
- </tr>
- <tr>
- <td valign="top"><a name="Hangul_Syllable_Type" href="#Hangul_Syllable_Type">Hangul_Syllable_Type</a></td>
- <td valign="top" align="center">E</td>
- <td valign="top" align="center">N</td>
- <td valign="top">The values L, V, T, LV, and LVT used in <i>Chapter 3, Conformance</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="IndicPositionalCategory.txt" href="#IndicPositionalCategory.txt">IndicPositionalCategory.txt</a></th>
- </tr>
- <tr>
- <td valign="top"><a name="Indic_Matra_Category"></a>
- <a name="Indic_Positional_Category" href="#Indic_Positional_Category">Indic_Positional_Category</a></td>
- <td valign="top" align="center">E</td>
- <td valign="top" align="center">I</td>
- <td valign="top">A property informally defining the
- positional categories
- for dependent vowels, viramas, combining marks, and other characters used in Indic scripts.
- General descriptions of the property values are provided in the header section
- of the data file IndicPositionalCategory.txt.</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="IndicSyllabicCategory.txt" href="#IndicSyllabicCategory.txt">IndicSyllabicCategory.txt</a></th>
- </tr>
- <tr>
- <td valign="top"><a name="Indic_Syllabic_Category" href="#Indic_Syllabic_Category">Indic_Syllabic_Category</a></td>
- <td valign="top" align="center">E</td>
- <td valign="top" align="center">I</td>
- <td valign="top">A property informally defining the structural categories
- of syllabic components in Indic scripts.
- General descriptions of the property values are provided in the header section
- of the data file IndicSyllabicCategory.txt.</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="Jamo.txt" href="#Jamo.txt">Jamo.txt</a></th>
- </tr>
- <tr>
- <td valign="top"><a name="Jamo_Short_Name" href="#Jamo_Short_Name">Jamo_Short_Name</a></td>
- <td valign="top" align="center">M</td>
- <td valign="top" align="center">C</td>
- <td valign="top">The Hangul Syllable names are derived from the Jamo Short
- Names, as described in <i>Chapter 3, Conformance</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="LineBreak.txt" href="#LineBreak.txt">LineBreak.txt</a></th>
- </tr>
- <tr>
- <td><a name="Line_Break" href="#Line_Break">Line_Break</a></td>
- <td>E</td>
- <td valign="top">N</td>
- <td>A property
- for line breaking. For more information, see Unicode Standard Annex #14, "Unicode Line Breaking
- Algorithm" [<a href="../tr41/tr41-21.html#UAX14">UAX14</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="GraphemeBreakProperty.txt" href="#GraphemeBreakProperty.txt">GraphemeBreakProperty.txt</a></th>
- </tr>
- <tr>
- <td><a name="Grapheme_Cluster_Break" href="#Grapheme_Cluster_Break">Grapheme_Cluster_Break</a></td>
- <td>E</td>
- <td valign="top">I</td>
- <td>See Unicode Standard Annex #29, "Unicode Text Segmentation" [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="SentenceBreakProperty.txt" href="#SentenceBreakProperty.txt">SentenceBreakProperty.txt</a></th>
- </tr>
- <tr>
- <td><a name="Sentence_Break" href="#Sentence_Break">Sentence_Break</a></td>
- <td>E</td>
- <td valign="top">I</td>
- <td>See Unicode Standard Annex #29, "Unicode Text Segmentation" [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="WordBreakProperty.txt" href="#WordBreakProperty.txt">WordBreakProperty.txt</a></th>
- </tr>
- <tr>
- <td><a name="Word_Break" href="#Word_Break">Word_Break</a></td>
- <td>E</td>
- <td valign="top">I</td>
- <td>See Unicode Standard Annex #29, "Unicode Text Segmentation" [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="NameAliases.txt" href="#NameAliases.txt">NameAliases.txt</a></th>
- </tr>
- <tr>
- <td valign="top"><a name="Name_Alias" href="#Name_Alias">Name_Alias</a></td>
- <td valign="top" align="center">M</td>
- <td valign="top" align="center">N</td>
- <td valign="top">Normative formal aliases for characters with erroneous
- names, for control characters and some format characters,
- and for character abbreviations, as described in <i>Chapter 4, Character Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- Aliases tagged with the type "correction", as well as a selection of aliases of other types, are
- published in the Unicode Standard code charts.</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="NormalizationCorrections.txt" href="#NormalizationCorrections.txt">NormalizationCorrections.txt</a></th>
- </tr>
- <tr>
- <td valign="top"><i>used in Decomposition Mappings</i></td>
- <td valign="top" align="center">S</td>
- <td valign="top" align="center">N</td>
- <td valign="top">NormalizationCorrections lists code point differences for
- <i><a href="http://www.unicode.org/versions/corrigenda.html">Normalization Corrigenda</a></i>.
- For more information, see Unicode Standard Annex #15, "Unicode Normalization Forms"
- [<a href="../tr41/tr41-21.html#UAX15">UAX15</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="Scripts.txt" href="#Scripts.txt">Scripts.txt</a></th>
- </tr>
- <tr>
- <td><a name="Script" href="#Script">Script</a></td>
- <td>C</td>
- <td valign="top">I</td>
- <td>Script values for use in regular expressions and elsewhere.
- For more information, see Unicode Standard Annex
- #24, "Unicode Script Property" [<a href="../tr41/tr41-21.html#UAX24">UAX24</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="ScriptExtensions.txt" href="#ScriptExtensions.txt">ScriptExtensions.txt</a></th>
- </tr>
- <tr>
- <td><a name="Script_Extensions" href="#Script_Extensions">Script_Extensions</a></td>
- <td>M</td>
- <td valign="top">I</td>
- <td>Enumerated sets of Script values for use in regular expressions and elsewhere.
- For more information, see Unicode Standard Annex
- #24, "Unicode Script Property" [<a href="../tr41/tr41-21.html#UAX24">UAX24</a>].</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="SpecialCasing.txt" href="#SpecialCasing.txt">SpecialCasing.txt</a></th>
- </tr>
- <tr>
- <td><a name="Uppercase_Mapping" href="#Uppercase_Mapping">Uppercase_Mapping</a><br>
- <a name="Lowercase_Mapping" href="#Lowercase_Mapping">Lowercase_Mapping</a><br>
- <a name="Titlecase_Mapping" href="#Titlecase_Mapping">Titlecase_Mapping</a><br>
- </td>
- <td>S</td>
- <td valign="top">I</td>
- <td>Data for producing (in combination with the simple case mappings
- from <a href="#UnicodeData.txt">UnicodeData.txt</a>) the full case mappings.</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="Unihan.txt" href="#Unihan.txt">Unihan</a> data files (for more
- information, see [<a href="../tr41/tr41-21.html#UAX38">UAX38</a>])</th>
- </tr>
- <tr>
- <td><a name="Numeric_Type_Han" href="#Numeric_Type_Han">Numeric_Type</a><br>
- <a name="Numeric_Value_Han" href="#Numeric_Value_Han">Numeric_Value</a></td>
- <td>E</td>
- <td valign="top">I</td>
- <td>The characters tagged with either kPrimaryNumeric,
- kAccountingNumeric, or kOtherNumeric are given the property value
- Numeric_Type=Numeric, and the Numeric_Value indicated
- in those tags.
- <p>Most characters have these numeric properties based on values from UnicodeData.txt.
- See <a href="#Numeric_Type">Numeric_Type</a>.</td>
- </tr>
- <tr>
- <td><a name="Unicode_Radical_Stroke" href="#Unicode_Radical_Stroke">Unicode_Radical_Stroke</a></td>
- <td>M</td>
- <td valign="top">I</td>
- <td>The Unicode radical-stroke count, based on the tag
- kRSUnicode.</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="DerivedCoreProperties.txt" href="#DerivedCoreProperties.txt">DerivedCoreProperties.txt</a></th>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Lowercase" href="#Lowercase">Lowercase</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters with the Lowercase property. For more information, see
- <i>Chapter 4, Character Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].<p><i>Generated from: Ll + <a href="#Other_Lowercase">Other_Lowercase</a></i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Uppercase" href="#Uppercase">Uppercase</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters with the Uppercase property. For more information, see
- <i>Chapter 4, Character Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].<p><i>Generated from: Lu + <a href="#Other_Uppercase">Other_Uppercase</a></i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Cased" href="#Cased">Cased</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters which are considered to be either uppercase, lowercase
- or titlecase characters. This property is not identical to the
- Changes_When_Casemapped property. For more information, see D135 in <i>Section 3.13, Default Case
- Algorithms</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- <p><i>Generated from: <a href="#Lowercase">Lowercase</a> + <a href="#Uppercase">Uppercase</a> + Lt</i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Case_Ignorable" href="#Case_Ignorable">Case_Ignorable</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters which are ignored for casing purposes. For more
- information, see D136 in <i>Section 3.13, Default Case
- Algorithms</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- <p><i>Generated from: Mn + Me + Cf + Lm + Sk + <a href="#Word_Break">Word_Break</a>=MidLetter +
- <a href="#Word_Break">Word_Break</a>=MidNumLet + <a href="#Word_Break">Word_Break</a>=Single_Quote</i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="CWL" href="#CWL">Changes_When_Lowercased</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters whose normalized forms are not stable under a toLowercase
- mapping. For more information, see D139 in <i>Section 3.13, Default Case
- Algorithms</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- <p><i>Generated from: toLowercase(toNFD(X)) != toNFD(X)</i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="CWU" href="#CWU">Changes_When_Uppercased</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters whose normalized forms are not stable under a toUppercase
- mapping. For more information, see D140 in <i>Section 3.13, Default Case
- Algorithms</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- <p><i>Generated from: toUppercase(toNFD(X)) != toNFD(X)</i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="CWT" href="#CWT">Changes_When_Titlecased</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters whose normalized forms are not stable under a toTitlecase
- mapping. For more information, see D141 in <i>Section 3.13, Default Case
- Algorithms</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- <p><i>Generated from: toTitlecase(toNFD(X)) != toNFD(X)</i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="CWCF" href="#CWCF">Changes_When_Casefolded</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters whose normalized forms are not stable under case
- folding. For more information, see D142 in <i>Section 3.13, Default Case
- Algorithms</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- <p><i>Generated from: toCasefold(toNFD(X)) != toNFD(X)</i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="CWCM" href="#CWCM">Changes_When_Casemapped</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters which may change when they undergo case mapping.
- For more information, see D143 in <i>Section 3.13, Default Case
- Algorithms</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- <p><i>Generated from: Changes_When_Lowercased(X) or Changes_When_Uppercased(X) or
- Changes_When_Titlecased(X)</i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Alphabetic" href="#Alphabetic">Alphabetic</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters with the Alphabetic property. For more information, see
- <i>Chapter 4, Character Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- <p><i>Generated from:
- <a href="#Lowercase">Lowercase</a> + <a href="#Uppercase">Uppercase</a> + Lt + Lm +
- Lo + Nl + <a href="#Other_Alphabetic">Other_Alphabetic</a></i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Default_Ignorable_Code_Point" href="#Default_Ignorable_Code_Point">
- Default_Ignorable_Code_Point</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">For programmatic determination of default ignorable code points. New
- characters that should be ignored in rendering (unless explicitly supported) will be assigned
- in these ranges, permitting programs to correctly handle the default rendering of such
- characters when not otherwise supported. For more information, see the FAQ
- <a href="http://www.unicode.org/faq/unsup_char.html">Display of Unsupported Characters</a>,
- and <i>Section 5.21, Ignoring Characters in Processing</i>
- in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
-
- <p><i>Generated from:<br>
- <a href="#Other_Default_Ignorable_Code_Point">Other_Default_Ignorable_Code_Point</a><br>
- + Cf (format characters)<br>
- + Variation_Selector<br>
- - White_Space<br>
- - FFF9..FFFB (annotation characters)<br>
- - 0600..0605, 06DD, 070F, 08E2, 110BD (exceptional Cf characters that should be visible)</i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Grapheme_Base" href="#Grapheme_Base">Grapheme_Base</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Property used together with the definition of Standard Korean Syllable
- Block to define "Grapheme base". See D58 in <i>Chapter 3, Conformance</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- <p><i>Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp -
- <a href="#Grapheme_Extend">Grapheme_Extend</a></i>
- <p><b>Note:</b> Grapheme_Base is a property of individual characters. That usage contrasts
- with "grapheme base", which is an attribute of Unicode strings; a grapheme base may consist
- of a Korean syllable which is itself represented by a sequence of conjoining jamos.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Grapheme_Extend" href="#Grapheme_Extend">Grapheme_Extend</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Property used
- to define "Grapheme extender". See D59 in <i>Chapter 3, Conformance</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- <p><i>Generated from: Me + Mn + <a href="#Other_Grapheme_Extend">Other_Grapheme_Extend</a></i></p>
- <p><b>Note:</b> The set of characters for which Grapheme_Extend=Yes is equivalent to
- the set of characters for which Grapheme_Cluster_Break=Extend.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Grapheme_Link" href="#Grapheme_Link">Grapheme_Link</a>
- (<a href="#Deprecated_Properties">Deprecated</a> as of 5.0.0)</td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Formerly proposed for programmatic determination of grapheme cluster boundaries.
- <p><i>Generated from: Canonical_Combining_Class=Virama</i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Math" href="#Math">Math</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters with the Math property. For more information, see
- <i>Chapter 4, Character Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].<p><i>Generated from: Sm + <a href="#Other_Math">Other_Math</a></i></td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="ID_Start" href="#ID_Start">ID_Start</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top" rowspan="4">Used to determine programming identifiers, as described
- in Unicode Standard Annex #31, "Unicode Identifier and Pattern Syntax" [<a href="../tr41/tr41-21.html#UAX31">UAX31</a>].</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="ID_Continue" href="#ID_Continue">ID_Continue</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="XID_Start" href="#XID_Start">XID_Start</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="XID_Continue" href="#XID_Continue">XID_Continue</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="DerivedNormalizationProps.txt" href="#DerivedNormalizationProps.txt">DerivedNormalizationProps.txt</a></th>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Full_Composition_Exclusion" href="#Full_Composition_Exclusion">Full_Composition_Exclusion</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Characters that are excluded from composition: those listed explicitly in
- CompositionExclusions.txt, plus the derivable sets of
- <i>Singleton Decompositions</i> and
- <i>Non-Starter Decompositions</i>, as documented in that data file.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Expands_On_NFC" href="#Expands_On_NFC">Expands_On_NFC</a><br>
- <a name="Expands_On_NFD" href="#Expands_On_NFD">Expands_On_NFD</a><br>
- <a name="Expands_On_NFKC" href="#Expands_On_NFKC">Expands_On_NFKC</a><br>
- <a name="Expands_On_NFKD" href="#Expands_On_NFKD">Expands_On_NFKD</a><br>
- (<a href="#Deprecated_Properties">Deprecated</a> as of 6.0.0)</td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Characters that expand to more than one character in the specified
- normalization form.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="FC_NFKC_Closure" href="#FC_NFKC_Closure">FC_NFKC_Closure</a><br>
- (<a href="#Deprecated_Properties">Deprecated</a> as of 6.0.0)</td>
- <td valign="top">S</td>
- <td valign="top">N</td>
- <td valign="top">Characters that require extra mappings for closure under Case Folding plus
- Normalization Form KC.
- <p>The mapping is listed in Field 2.</p>
- </td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="NFD_Quick_Check" href="#NFD_Quick_Check">NFD_Quick_Check</a><br>
- <a name="NFKD_Quick_Check" href="#NFKD_Quick_Check">NFKD_Quick_Check</a><br>
- <a name="NFC_Quick_Check" href="#NFC_Quick_Check">NFC_Quick_Check</a><br>
- <a name="NFKC_Quick_Check" href="#NFKC_Quick_Check">NFKC_Quick_Check</a></td>
- <td valign="top">E</td>
- <td valign="top">N</td>
- <td valign="top">For property values, see <a href="#Decompositions_and_Normalization">
- Decompositions and Normalization</a>. (Abbreviated names: NFD_QC, NFKD_QC, NFC_QC, NFKC_QC)</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="NFKC_Casefold" href="#NFKC_Casefold">NFKC_Casefold</a></td>
- <td valign="top">S</td>
- <td valign="top">I</td>
- <td valign="top">A mapping designed for best behavior when doing caseless
- matching of strings interpreted as identifiers. (Abbreviated name: NFKC_CF)
- <p>For the definition of the related string
- transform toNFKC_Casefold() based on this mapping, see <i>Section 3.13, Default
- Case Algorithms</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</p>
- <p>The mapping is listed in Field 2.
- </td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="CWKCF" href="#CWKCF">Changes_When_NFKC_Casefolded</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters which are not identical to their NFKC_Casefold
- mapping.
- <p><i>Generated from: (cp != NFKC_Casefold(cp))</i>
- </td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="PropList.txt" href="#PropList.txt">PropList.txt</a></th>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="ASCII_Hex_Digit" href="#ASCII_Hex_Digit">ASCII_Hex_Digit</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">ASCII characters commonly used for the representation of hexadecimal numbers.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Bidi_Control" href="#Bidi_Control">Bidi_Control</a></td>
- <td valign="top" align="center">B</td>
- <td valign="top">N</td>
- <td valign="top">Format control characters which have specific functions in the
- Unicode Bidirectional Algorithm [<a href="../tr41/tr41-21.html#UAX9">UAX9</a>].</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Dash" href="#Dash">Dash</a></td>
- <td valign="top" align="center">B</td>
- <td valign="top">I</td>
- <td valign="top">Punctuation characters explicitly called out as dashes in the Unicode
- Standard, plus their compatibility equivalents. Most of these have the General_Category value Pd,
- but some have the General_Category value Sm because of their use in mathematics.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Deprecated" href="#Deprecated">Deprecated</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">For a machine-readable list of deprecated characters. No characters will ever
- be removed from the standard, but the usage of deprecated characters is strongly discouraged.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Diacritic" href="#Diacritic">Diacritic</a></td>
- <td valign="top" align="center">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters that linguistically modify the meaning of another character to
- which they apply. Some diacritics are not combining characters, and some combining characters
- are not diacritics.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Extender" href="#Extender">Extender</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters whose principal function is to extend the value or shape of a
- preceding alphabetic character. Typical of these are length and iteration marks.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Hex_Digit" href="#Hex_Digit">Hex_Digit</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters commonly used for the representation of hexadecimal numbers, plus
- their compatibility equivalents.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Hyphen" href="#Hyphen">Hyphen</a>
- (<a href="#Stabilized_Properties">Stabilized</a> as of 4.0.0;
- <a href="#Deprecated_Properties">Deprecated</a> as of 6.0.0)</td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Dashes which are used to mark connections between pieces of words, plus the
- <i>Katakana middle dot</i>. The <i>Katakana middle dot</i> functions like a hyphen, but is shaped like a dot
- rather than a dash.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Ideographic" href="#Ideographic">Ideographic</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese)
- or other siniform (Chinese writing-related) ideographs. This property roughly defines the class of
- "Chinese characters" and does not include characters of other
- logographic scripts such as Cuneiform or Egyptian Hieroglyphs. The
- Ideographic property is used in the definition of
- Ideographic Description Sequences.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="IDS_Binary_Operator" href="#IDS_Binary_Operator">IDS_Binary_Operator</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Used in Ideographic Description Sequences.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="IDS_Trinary_Operator" href="#IDS_Trinary_Operator">IDS_Trinary_Operator</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Used in Ideographic Description Sequences.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Join_Control" href="#Join_Control">Join_Control</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Format control characters which have specific functions for control of
- cursive joining and ligation.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Logical_Order_Exception" href="#Logical_Order_Exception">Logical_Order_Exception</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">A small number of spacing vowel letters occurring in certain
- Southeast Asian scripts such as Thai and Lao, which use a visual order display
- model. These letters are stored in text ahead of syllable-initial consonants,
- and require special handling for processes such as searching and sorting.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Noncharacter_Code_Point" href="#Noncharacter_Code_Point">Noncharacter_Code_Point</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Code points permanently reserved for internal use.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Other_Alphabetic" href="#Other_Alphabetic">Other_Alphabetic</a></td>
- <td valign="top" align="center">B</td>
- <td valign="top">C</td>
- <td valign="top">Used in deriving the Alphabetic property.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Other_Default_Ignorable_Code_Point" href="#Other_Default_Ignorable_Code_Point">
- Other_Default_Ignorable_Code_Point</a></td>
- <td valign="top">B</td>
- <td valign="top">C</td>
- <td valign="top">Used in deriving the Default_Ignorable_Code_Point property.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Other_Grapheme_Extend" href="#Other_Grapheme_Extend">Other_Grapheme_Extend</a></td>
- <td valign="top" align="center">B</td>
- <td valign="top">C</td>
- <td valign="top">Used in deriving the Grapheme_Extend property.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Other_ID_Continue" href="#Other_ID_Continue">Other_ID_Continue</a></td>
- <td valign="top">B</td>
- <td valign="top">C</td>
- <td valign="top">Used to maintain backward compatibility of <a href="#ID_Continue">ID_Continue</a>.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Other_ID_Start" href="#Other_ID_Start">Other_ID_Start</a></td>
- <td valign="top">B</td>
- <td valign="top">C</td>
- <td valign="top">Used to maintain backward compatibility of <a href="#ID_Start">ID_Start</a>.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Other_Lowercase" href="#Other_Lowercase">Other_Lowercase</a></td>
- <td valign="top">B</td>
- <td valign="top">C</td>
- <td valign="top">Used in deriving the Lowercase property.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Other_Math" href="#Other_Math">Other_Math</a></td>
- <td valign="top">B</td>
- <td valign="top">C</td>
- <td valign="top">Used in deriving the Math property.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Other_Uppercase" href="#Other_Uppercase">Other_Uppercase</a></td>
- <td valign="top">B</td>
- <td valign="top">C</td>
- <td valign="top">Used in deriving the Uppercase property.</td>
- </tr>
- <tr>
- <td><a name="Pattern_Syntax" href="#Pattern_Syntax">Pattern_Syntax</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top" rowspan="2">Used for pattern syntax as described in Unicode Standard Annex #31, "Unicode Identifier
- and Pattern Syntax" [<a href="../tr41/tr41-21.html#UAX31">UAX31</a>].</td>
- </tr>
- <tr>
- <td><a name="Pattern_White_Space" href="#Pattern_White_Space">Pattern_White_Space</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- </tr>
- <tr>
- <td><a name="Prepended_Concatenation_Mark" href="#Prepended_Concatenation_Mark">Prepended_Concatenation_Mark</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">A small class of visible format controls, which precede and then span
- a sequence of other characters, usually digits. These have also been known as
- "subtending marks", because most of them take a form which visually extends underneath
- the sequence of following digits.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Quotation_Mark" href="#Quotation_Mark">Quotation_Mark</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Punctuation characters that function as quotation marks.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Radical" href="#Radical">Radical</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Used in the definition of Ideographic Description Sequences.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Regional_Indicator" href="#Regional_Indicator">Regional_Indicator</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Property of the regional indicator characters, U+1F1E6..U+1F1FF. This
- property is referenced in various segmentation algorithms, to assist in correct
- breaking around emoji flag sequences.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="STerm" href="#STerm">Sentence_Terminal</a></td>
- <td valign="top">B</td>
- <td valign="top">I</td>
- <td valign="top">Punctuation characters that generally mark the end of sentences.
- Used in Unicode Standard Annex #29, "Unicode Text Segmentation"
- [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>].</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Soft_Dotted" href="#Soft_Dotted">Soft_Dotted</a></td>
- <td valign="top" align="center">B</td>
- <td valign="top">N</td>
- <td valign="top">Characters with a "soft dot", like <i>i</i> or <i>j</i>. An accent placed on
- these characters causes the dot to disappear. An explicit <i>dot above</i> can be added where
- required, such as in Lithuanian. See <i>Section 7.1, Latin</i>
- in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Terminal_Punctuation" href="#Terminal_Punctuation">Terminal_Punctuation</a></td>
- <td valign="top" align="center">B</td>
- <td valign="top">I</td>
- <td valign="top">Punctuation characters that generally mark the end of textual units.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Unified_Ideograph" href="#Unified_Ideograph">Unified_Ideograph</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">A property which specifies
- the exact set of Unified CJK Ideographs in the standard. This set
- excludes CJK Compatibility Ideographs (which have canonical decompositions
- to Unified CJK Ideographs), as well as characters from the CJK
- Symbols and Punctuation block. The class of
- Unified_Ideograph=Y characters is a proper subset of the class of
- Ideographic=Y characters.</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="Variation_Selector" href="#Variation_Selector">Variation_Selector</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Indicates characters that are Variation Selectors. For
- details on the behavior of these characters, see
- <i>Section 23.4, Variation Selectors</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>],
- and Unicode Technical Standard #37, "Unicode Ideographic Variation Database" [<a href="../tr41/tr41-21.html#UTS37">UTS37</a>].</td>
- </tr>
- <tr>
- <td valign="top" align="left"><a name="White_Space" href="#White_Space">White_Space</a></td>
- <td valign="top">B</td>
- <td valign="top">N</td>
- <td valign="top">Spaces, separator characters and
- other control characters which should be treated by
- programming languages as "white space" for the purpose of parsing elements.
- See also <a href="#Line_Break">Line_Break</a>,
- <a href="#Grapheme_Cluster_Break">Grapheme_Cluster_Break</a>,
- <a href="#Sentence_Break">Sentence_Break</a>,
- and <a href="#Word_Break">Word_Break</a>, which classify space characters and related controls somewhat differently
- for particular text segmentation contexts.
- </td>
- </tr>
-
- <tr>
- <th valign="top" align="LEFT" colspan="4">
- <a name="UnicodeData.txt" href="#UnicodeData.txt">UnicodeData.txt</a></th>
- </tr>
- <tr>
- <td valign="top"><a name="Name" href="#Name">Name</a></td>
- <td valign="top" align="center">M</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(1)
- When a string value not enclosed in &lt;angle brackets&gt;
- occurs in this field, it specifies the character's Name property value, which
- matches exactly the name published in
- the code charts.
- The Name property value for most ideographic characters and
- for Hangul syllables is derived instead by various rules. See <i>Section 4.8, Name</i> in
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>] for a full specification of those
- rules. Strings enclosed in &lt;angle brackets&gt; in this field either provide label
- information used in the name derivation rules, or—in the case of characters
- which have a null string as their Name property value, such as control characters—provide
- other information about their code point type.
- </td>
- </tr>
- <tr>
- <td valign="top"><a name="General_Category" href="#General_Category">General_Category</a></td>
- <td valign="top" align="center">E</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(2) This is a useful breakdown into various character types which can be used
- as a default categorization in implementations. For the property values, see
- <a href="#General_Category_Values">General Category Values</a>.</td>
- </tr>
- <tr>
- <td valign="top"><a name="Canonical_Combining_Class" href="#Canonical_Combining_Class">Canonical_Combining_Class</a></td>
- <td valign="top" align="center">N</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(3) The classes used for the Canonical Ordering Algorithm in the Unicode
- Standard. This property could be considered either an
- enumerated property or a numeric property: the principal use of the property is in
- terms of the numeric values. For the property value names associated with different numeric values, see
- <a href="#DerivedCombiningClass.txt">DerivedCombiningClass.txt</a> and <a href="#Canonical_Combining_Class_Values">Canonical Combining
- Class Values</a>.</td>
- </tr>
- <tr>
- <td valign="top"><a name="Bidi_Class" href="#Bidi_Class">Bidi_Class</a></td>
- <td valign="top" align="center">E</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(4) These are the categories required by the Unicode Bidirectional Algorithm.
- For the property values, see <a href="#Bidi_Class_Values">Bidirectional Class
- Values</a>. For more information, see Unicode Standard Annex #9, "Unicode Bidirectional Algorithm"
- [<a href="../tr41/tr41-21.html#UAX9">UAX9</a>].<p>
- The default property values depend on the code point, and are explained in
- DerivedBidiClass.txt.</td>
- </tr>
- <tr>
- <td valign="top"><a name="Decomposition_Type" href="#Decomposition_Type">Decomposition_Type</a><br>
- <a name="Decomposition_Mapping" href="#Decomposition_Mapping">Decomposition_Mapping</a></td>
- <td valign="top" align="center">E, S</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(5) This field contains both values, with the type in angle brackets. The
- decomposition mappings exactly match the decomposition mappings published with the character
- names in the Unicode Standard. For more information, see
- <a href="#Character_Decomposition_Mappings">Character Decomposition Mappings</a>.
- </td>
- </tr>
- <tr>
- <td valign="top" rowspan="3"><a name="Numeric_Type" href="#Numeric_Type">Numeric_Type</a><br>
- <a name="Numeric_Value" href="#Numeric_Value">Numeric_Value</a></td>
- <td valign="top" align="center">E, N</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(6) If the character has the
- property value Numeric_Type=Decimal, then the
- Numeric_Value of that digit is represented with an integer
- value (limited to the range 0..9) in fields 6, 7, and 8.
- Characters with the property value Numeric_Type=Decimal are
- restricted to digits which can be used in a decimal radix positional numeral system and
- which are encoded in the standard in a contiguous ascending range 0..9. See the discussion of
- <i>decimal digits</i> in <i>Chapter 4, Character Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</td>
- </tr>
- <tr>
- <td valign="top" align="center">E, N</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(7) If the character has the
- property value Numeric_Type=Digit, then the
- Numeric_Value of that digit is represented with an
- integer value (limited to the range 0..9) in fields 7 and 8, and field 6 is null.
- This covers digits that need special handling, such as the compatibility superscript digits.
- <p>Starting with Unicode 6.3.0, no newly encoded numeric characters will be
- given Numeric_Type=Digit, nor will existing characters with Numeric_Type=Numeric be changed
- to Numeric_Type=Digit. The distinction between those two types is not considered useful.</p></td>
- </tr>
- <tr>
- <td valign="top" align="center">E, N</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(8) If the character has the
- property value Numeric_Type=Numeric, then the
- Numeric_Value of that character is represented with a positive or
- negative integer or rational number in this field, and
- fields 6 and 7 are null. This includes fractions such as "1/5" for
- U+2155 VULGAR FRACTION ONE FIFTH.
- <p>Some characters have these properties based on values from the Unihan data files. See
- <a href="#Numeric_Type_Han">Numeric_Type, Han</a>.</p></td>
- </tr>
- <tr>
- <td valign="top"><a name="Bidi_Mirrored" href="#Bidi_Mirrored">Bidi_Mirrored</a></td>
- <td valign="top" align="center">B</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(9) If the character is a "mirrored" character in
- bidirectional text, this field has the value "Y"; otherwise "N".
- See <i>Section 4.7, Bidi Mirrored</i> of [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>]. <i>Do not confuse this with
- the <a href="#Bidi_Mirroring_Glyph">Bidi_Mirroring_Glyph</a> property.</i></td>
- </tr>
- <tr>
- <td valign="top"><a name="Unicode_1_Name" href="#Unicode_1_Name">Unicode_1_Name</a>
- (<a href="#Obsolete_Properties">Obsolete</a> as of 6.2.0)</td>
- <td valign="top" align="center">M</td>
- <td valign="top" align="center">I</td>
- <td valign="top">(10) The old name as published in Unicode 1.0, or the
- ISO 6429 name for control functions. This field is empty unless the old name is significantly
- different from the current name for the character.
- No longer used in code chart production. See <a href="#Name_Alias">Name_Alias</a>.
- </td>
- </tr>
- <tr>
- <td valign="top"><a name="ISO_Comment" href="#ISO_Comment">ISO_Comment</a>
- (<a href="#Obsolete_Properties">Obsolete</a> as of 5.2.0;
- <a href="#Deprecated_Properties">Deprecated</a> and <a href="#Stabilized_Properties">Stabilized</a>
- as of 6.0.0)</td>
- <td valign="top" align="center">M</td>
- <td valign="top" align="center">I</td>
- <td valign="top">(11) ISO 10646 comment field. It
- was used for notes that appeared in parentheses in the
- 10646 names list, or contained an asterisk to mark an Annex P note.
- <p>As of Unicode 5.2.0, this field no longer contains any non-null values.</p>
- </td>
- </tr>
- <tr>
- <td valign="top"><a name="Simple_Uppercase_Mapping" href="#Simple_Uppercase_Mapping">Simple_Uppercase_Mapping</a></td>
- <td valign="top" align="center">S</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(12) Simple uppercase mapping (single character result).
- If a character is
- part of an alphabet with case distinctions, and has a simple uppercase equivalent, then the
- uppercase equivalent is in this field. The
- simple mappings have a single character result, whereas the full mappings may have
- multi-character results. For more information, see <a href="#Casemapping">Case and Case Mapping</a>.
- </td>
- </tr>
- <tr>
- <td valign="top"><a name="Simple_Lowercase_Mapping" href="#Simple_Lowercase_Mapping">Simple_Lowercase_Mapping</a></td>
- <td valign="top" align="center">S</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(13) Simple lowercase mapping (single character result).
- </td>
- </tr>
- <tr>
- <td><a name="Simple_Titlecase_Mapping" href="#Simple_Titlecase_Mapping">Simple_Titlecase_Mapping</a></td>
- <td valign="top" align="center">S</td>
- <td valign="top" align="center">N</td>
- <td valign="top">(14) Simple titlecase mapping (single character result).
- <p><b>Note:</b> If this
- field is null, then the Simple_Titlecase_Mapping is the same as the
- Simple_Uppercase_Mapping for this character.</p></td>
- </tr>
- <tr>
- <th colspan="4">
- <a name="VerticalOrientation.txt" href="#VerticalOrientation.txt">VerticalOrientation.txt</a></th>
- </tr>
- <tr>
- <td><a name="Vertical_Orientation" href="#Vertical_Orientation">Vertical_Orientation</a></td>
- <td>E</td>
- <td>I</td>
- <td>A property used to establish a default for the correct orientation of characters
- when used in vertical text layout, as described in Unicode Standard Annex #50,
- "Unicode Vertical Text Layout"
- [<a href="../tr41/tr41-21.html#UAX50">UAX50</a>].</td>
- </tr>
- </table>
- <p> </p>
- <h3>5.4 <a name="Derived_Extracted" href="#Derived_Extracted">Derived Extracted Properties</a></h3>
- <p>A number of Unicode character properties have been separated out, reformatted,
- and listed in range format, one property per file. These files
- are located under the <i>extracted</i> directory of the UCD.
- The exact list of derived extracted files and the extracted properties they
- represent is given in <a href="#Extracted_Properties_Table"><i>Table 10</i></a>.</p>
-
- <p>The derived extracted files are provided
- primarily as a reformatting of data for properties specified in other data files.
- For <i>nondefault</i> values of properties, if there is
- any inadvertent mismatch between the primary data files specifying
- those properties and these lists of extracted properties, the primary
- data files are taken as definitive. However, for <i>default</i> values
- of properties, the extracted data files are definitive. This is particularly true for properties
- which have multiple default values; those properties are identified with an asterisk
- in the table. See Section 4.2.9, <a href="#Default_Values">Default Values</a>.</p>
- <p class="caption">Table 10. <a name="Extracted_Properties_Table" href="#Extracted_Properties_Table">Extracted Properties</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>File</th>
- <th>Status</th>
- <th>Property</th>
- <th>Extracted from</th>
- </tr>
- <tr>
- <td>DerivedBidiClass.txt</td>
- <td style="text-align:center">N</td>
- <td>Bidi_Class*</td>
- <td>UnicodeData.txt, field 4</td>
- </tr>
- <tr>
- <td>DerivedBinaryProperties.txt</td>
- <td style="text-align:center">N</td>
- <td>Bidi_Mirrored</td>
- <td>UnicodeData.txt, field 9</td>
- </tr>
- <tr>
- <td><a name="DerivedCombiningClass.txt"></a>DerivedCombiningClass.txt</td>
- <td style="text-align:center">N</td>
- <td>Canonical_Combining_Class</td>
- <td>UnicodeData.txt, field 3</td>
- </tr>
- <tr>
- <td>DerivedDecompositionType.txt</td>
- <td style="text-align:center">N/I</td>
- <td>Decomposition_Type</td>
- <td>the &lt;tag&gt; in UnicodeData.txt, field 5</td>
- </tr>
- <tr>
- <td>DerivedEastAsianWidth.txt</td>
- <td style="text-align:center">I</td>
- <td>East_Asian_Width*</td>
- <td>EastAsianWidth.txt, field 1</td>
- </tr>
- <tr>
- <td>DerivedGeneralCategory.txt</td>
- <td style="text-align:center">N</td>
- <td>General_Category</td>
- <td>UnicodeData.txt, field 2</td>
- </tr>
- <tr>
- <td>DerivedJoiningGroup.txt</td>
- <td style="text-align:center">N</td>
- <td>Joining_Group</td>
- <td>ArabicShaping.txt, field 3</td>
- </tr>
- <tr>
- <td>DerivedJoiningType.txt</td>
- <td style="text-align:center">N</td>
- <td>Joining_Type*</td>
- <td>ArabicShaping.txt, field 2</td>
- </tr>
- <tr>
- <td>DerivedLineBreak.txt</td>
- <td style="text-align:center">N</td>
- <td>Line_Break*</td>
- <td>LineBreak.txt, field 1</td>
- </tr>
- <tr>
- <td>DerivedName.txt</td>
- <td style="text-align:center">N</td>
- <td>Name</td>
- <td>UnicodeData.txt, field 1</td>
- </tr>
- <tr>
- <td>DerivedNumericType.txt</td>
- <td style="text-align:center">N</td>
- <td>Numeric_Type</td>
- <td>UnicodeData.txt, fields 6 through 8</td>
- </tr>
- <tr>
- <td>DerivedNumericValues.txt</td>
- <td style="text-align:center">N</td>
- <td>Numeric_Value</td>
- <td>UnicodeData.txt, field 8</td>
- </tr>
- </table>
- </div>
-
- <p>For the extraction of Decomposition_Type, characters with canonical
- decomposition mappings in field 5 of UnicodeData.txt have no tag. For
- those characters, the extracted value is Decomposition_Type=Canonical. For characters
- with compatibility decomposition mappings, there are explicit tags
- in field 5, and the value of Decomposition_Type
- is equivalent to those tags. The value Decomposition_Type=Canonical is
- normative. Other values for Decomposition_Type are informative.</p>
- <p>The value of the Name property is extracted based on the actual string value
- of the data in field 1 of UnicodeData.txt, omitting any code points
- with the default null string value. Then for code points in the
- Hangul Syllables block, the Hangul
- Syllable Name Generation algorithm defined in <i>Section 3.12, Conjoining
- Jamo Behavior</i> of [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>]
- is applied, to create the explicit formal
- names of all Hangul syllables. Characters whose names are algorithmically
- defined based on suffixing the code point to a specific identifying
- string prefix, such as CJK UNIFIED IDEOGRAPH-4E00, are listed with
- a compact range convention in DerivedName.txt, using an
- asterisk "*" character as the placeholder for the code point.
- See <i>Section 4.8, Name</i> of [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>]
- for more information about how the Name property is derived.</p>
-
- <p>Numeric_Value is extracted based on the actual numeric value of the
- data in field 8 of UnicodeData.txt or the values
- of the kPrimaryNumeric, kAccountingNumeric, or kOtherNumeric tags, for
- characters listed in the Unihan data files.</p>
-
- <p>Numeric_Type is extracted as follows. If fields 6, 7, and 8 in UnicodeData.txt
- are all non-empty, then Numeric_Type=Decimal. Otherwise, if fields 7 and 8 are both
- non-empty, then Numeric_Type=Digit. Otherwise, if field 8 is non-empty, then
- Numeric_Type=Numeric.
- For characters listed in the Unihan data files,
- Numeric_Type=Numeric for characters that have kPrimaryNumeric, kAccountingNumeric,
- or kOtherNumeric tags. The default value is Numeric_Type=None.</p>
-
- <h3>5.5 <a name="Contributory_Properties" href="#Contributory_Properties">Contributory Properties</a></h3>
- <p>Contributory properties contain sets of exceptions used in the generation of
- other properties derived from them. The contributory properties specifically concerned with
- identifiers and casing contribute to the maintenance of
- stability guarantees for properties and/or to invariance relationships
- between related properties. Other contributory properties are simply
- defined as a convenience for property derivation.</p>
-
- <p>Most contributory properties have names using
- the pattern "Other_XXX" and are used to derive the corresponding "XXX" property.
- For example, the Other_Alphabetic property is used in the derivation of the <a href="#Alphabetic">Alphabetic</a>
- property.</p>
-
- <p>Contributory properties are typically defined in
- <a href="#PropList.txt">PropList.txt</a> and the corresponding derived property
- is then listed in
- <a href="#DerivedCoreProperties.txt">DerivedCoreProperties.txt</a>.</p>
-
- <p><a href="#Jamo_Short_Name">Jamo_Short_Name</a> is an unusual contributory
- property, both in terms of its name and how it is used. It is defined in
- its own property file, Jamo.txt, and is used to derive the Name
- property value for Hangul syllable characters, according to the rules
- spelled out in <i>Section 3.12, Conjoining Jamo Behavior</i> in
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</p>
-
- <p><i>Contributory</i> is considered to be a distinct status for a Unicode
- character property. Contributory properties are neither <i>normative</i> nor
- <i>informative</i>. This distinct status is marked with
- the symbol "C" in the status column in the property table.
- For convenience of reference, all contributory properties are also listed
- in <a href="#Contributory_Properties_Table"><i>Table 10a</i></a>, along with the
- properties whose derivation they contribute to.</p>
-
- <p class="caption">Table 10a. <a name="Contributory_Properties_Table" href="#Contributory_Properties_Table">Contributory Properties</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>File</th>
- <th>Property</th>
- <th>Used in Derivation of</th>
- </tr>
- <tr>
- <td>Jamo.txt</td>
- <td>Jamo_Short_Name</td>
- <td>Name</td>
- </tr>
- <tr>
- <td rowspan="8" style="vertical-align:middle">PropList.txt</td>
- <td>Other_Alphabetic</td>
- <td>Alphabetic</td>
- </tr>
- <tr>
- <td>Other_Default_Ignorable_Code_Point</td>
- <td>Default_Ignorable_Code_Point</td>
- </tr>
- <tr>
- <td>Other_Grapheme_Extend</td>
- <td>Grapheme_Extend</td>
- </tr>
- <tr>
- <td>Other_ID_Start</td>
- <td>ID_Start, XID_Start</td>
- </tr>
- <tr>
- <td>Other_ID_Continue</td>
- <td>ID_Continue, XID_Continue</td>
- </tr>
- <tr>
- <td>Other_Lowercase</td>
- <td>Lowercase</td>
- </tr>
- <tr>
- <td>Other_Math</td>
- <td>Math</td>
- </tr>
- <tr>
- <td>Other_Uppercase</td>
- <td>Uppercase</td>
- </tr>
- </table>
- </div>
- <p>Contributory properties are
- incomplete by themselves and are not intended for independent use. For example,
- an API returning Unicode property values should implement the derived
- core properties such as Alphabetic or Default_Ignorable_Code_Point,
- rather than the corresponding contributory properties,
- Other_Alphabetic or Other_Default_Ignorable_Code_Point.</p>
-
-
- <h3>5.6 <a name="Casemapping" href="#Casemapping">Case and Case Mapping</a></h3>
- <p>Case for bicameral scripts and case mapping of characters are
- complicated topics in the Unicode Standard—both because of
- their inherent algorithmic complexity and because of the number of characters
- and special edge cases involved.</p>
-
- <p>This section provides a brief roadmap to discussions about these
- topics, and specifications and definitions in the standard, as well
- as explaining which case-related properties are defined in the UCD.</p>
-
- <p><i>Section 3.13, Default Case Algorithms</i> in
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>]
- provides formal definitions for a number of case-related concepts (<i>cased</i>,
- <i>case-ignorable</i>, ...), for
- case conversion (<i>toUppercase(X)</i>, ...), and for case detection
- (<i>isUppercase(X)</i>, ...). It also provides the formal definition
- of caseless matching for the standard, taking normalization
- into account.</p>
-
- <p><i>Section 4.2, Case</i> in
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>]
- introduces case and case mapping properties. <i>Table 4-3, Sources
- for Case Mapping Information</i>
- in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>] describes the kind of case-related
- information that is available in various data files of the UCD.
- <i>Table 11</i> lists those data files again, giving the
- explicit list of case-related properties defined in each.
- The link on each property leads to its description in
- <i>Table 9, <a href="#Property_List_Table">Property Table</a></i>.</p>
-
- <p class="caption">Table 11. <a name="Case_Properties_Table" href="#Case_Properties_Table">UCD Files and Case Properties</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>File Name</th>
- <th>Case Properties</th>
- </tr>
- <tr>
- <td>UnicodeData.txt</td>
- <td><a href="#Simple_Uppercase_Mapping">Simple_Uppercase_Mapping</a>,
- <a href="#Simple_Lowercase_Mapping">Simple_Lowercase_Mapping</a>,
- <a href="#Simple_Titlecase_Mapping">Simple_Titlecase_Mapping</a></td>
- </tr>
- <tr>
- <td>SpecialCasing.txt</td>
- <td><a href="#Uppercase_Mapping">Uppercase_Mapping</a>,
- <a href="#Lowercase_Mapping">Lowercase_Mapping</a>,
- <a href="#Titlecase_Mapping">Titlecase_Mapping</a></td>
- </tr>
- <tr>
- <td>CaseFolding.txt</td>
- <td><a href="#Simple_Case_Folding">Simple_Case_Folding</a>,
- <a href="#Case_Folding">Case_Folding</a></td>
- </tr>
- <tr>
- <td>DerivedCoreProperties.txt</td>
- <td><a href="#Uppercase">Uppercase</a>,
- <a href="#Lowercase">Lowercase</a>,
- <a href="#Cased">Cased</a>,
- <a href="#Case_Ignorable">Case_Ignorable</a>,
- <a href="#CWL">Changes_When_Lowercased</a>,
- <a href="#CWU">Changes_When_Uppercased</a>,
- <a href="#CWT">Changes_When_Titlecased</a>,
- <a href="#CWCF">Changes_When_Casefolded</a>,
- <a href="#CWCM">Changes_When_Casemapped</a>
- </td>
- </tr>
- <tr>
- <td>DerivedNormalizationProps.txt</td>
- <td><a href="#NFKC_Casefold">NFKC_Casefold</a>,
- <a href="#CWKCF">Changes_When_NFKC_Casefolded</a></td>
- </tr>
- <tr>
- <td>PropList.txt</td>
- <td><a href="#Soft_Dotted">Soft_Dotted</a>,
- <a href="#Other_Uppercase">Other_Uppercase</a>,
- <a href="#Other_Lowercase">Other_Lowercase</a></td>
- </tr>
- </table>
- </div>
-
- <p>For compatibility with existing parsers, UnicodeData.txt only
- contains case mappings for characters where they constitute one-to-one mappings;
- it also omits
- information about context-sensitive case mappings. Information about
- these special cases can be found in the separate data file,
- SpecialCasing.txt, expressed as separate properties.</p>
- <p><i>Section 5.18, Case Mappings</i>, in
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>]
- discusses various implementation issues for handling case,
- including language-specific case mapping, as for Greek and
- for Turkish. That section also describes case folding in particular detail.</p>
-
- <p>The special casing conditions associated with case mapping for Greek,
- Turkish, and Lithuanian are specified in an additional field in
- <a href="#SpecialCasing.txt">SpecialCasing.txt</a>. For example, the
- lowercase mapping for sigma in Greek varies according to its position
- in a word. The condition list does not constitute a formal character
- property in the UCD, because it is a statement about the context of occurrence
- of casing behavior for a character or characters, rather than a semantic
- attribute of those characters. Versions of the UCD from
- Version 3.2.0 to Version 5.0.0 <i>did</i> list property aliases
- for Special_Case_Condition (scc), but this was determined to be an error
- when the UCD was analyzed for representation in XML; consequently,
- the Special_Case_Condition property aliases were removed as of Version 5.1.0.</p>
-
- <p>Caseless matching is of particular concern for a number of text
- processing algorithms, so it is also discussed at some length
- in Unicode Standard Annex #31, "Unicode Identifier and Pattern Syntax"
- [<a href="../tr41/tr41-21.html#UAX31">UAX31</a>] and
- in Unicode Technical Standard #10, "Unicode Collation Algorithm"
- [<a href="../tr41/tr41-21.html#UTS10">UTS10</a>].</p>
-
- <p>Further information about locale-specific casing conventions
- can be found in the Unicode Common Locale Data Repository
- [<a href="../tr41/tr41-21.html#CLDR">CLDR</a>].</p>
-
- <h3>5.7 <a name="Property_Values" href="#Property_Values">Property Value Lists</a></h3>
- <p>The following subsections give summaries of property values for certain
- Enumeration properties. Other property values
- are documented in other, topically-specific annexes; for example,
- the Line_Break property values are documented in
- Unicode Standard Annex #14, "Unicode Line Breaking Algorithm"
- [<a href="../tr41/tr41-21.html#UAX14">UAX14</a>] and the
- various segmentation-related property values are documented in
- Unicode Standard Annex #29, "Unicode Text Segmentation"
- [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>].</p>
-
- <h4>5.7.1 <a name="General_Category_Values" href="#General_Category_Values">General Category Values</a></h4>
-
- <p>The General_Category property of a code point provides for the
- most general classification of that code point. It is usually
- determined based on the primary characteristic of the assigned
- character for that code point. For example, is the character a letter,
- a mark, a number, punctuation, or a symbol, and if so, of what
- type? Other General_Category values define the classification of
- code points which are not assigned to regular graphic characters,
- including such statuses as private-use, control, surrogate code
- point, and reserved unassigned.</p>
-
- <p>Many characters have multiple uses, and not all such cases
- can be captured entirely by the General_Category value. For example,
- the General_Category value of Latin, Greek, or Hebrew letters does not
- attempt to cover (or preclude) the numerical use of such letters
- as Roman numerals or in other numerary systems. Conversely, the
- General_Category of ASCII digits 0..9 as Nd (decimal digit)
- does not attempt to cover (or preclude) the occasional use of
- these digits as letters in various orthographies. The General_Category
- is simply the first-order, most usual categorization of a
- character.</p>
-
- <p>For more information about the General_Category
- property, see <i>Chapter 4, Character Properties</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</p>
- <p>The values in the General_Category field in UnicodeData.txt
- make use of the short, abbreviated property value aliases
- for General_Category. For convenience in reference, <i>Table 12</i>
- lists all the abbreviated and long value aliases for General_Category values, reproduced from
- <a href="#PropertyValueAliases.txt">PropertyValueAliases.txt</a>,
- along with a brief description of each category.</p>
-
- <p class="caption">Table 12. <a name="GC_Values_Table" href="#GC_Values_Table">General_Category Values</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Abbr</th>
- <th>Long</th>
- <th>Description</th>
- </tr>
- <tr>
- <td>Lu</td>
- <td>Uppercase_Letter</td>
- <td>an uppercase letter</td>
- </tr>
- <tr>
- <td>Ll</td>
- <td>Lowercase_Letter</td>
- <td>a lowercase letter</td>
- </tr>
- <tr>
- <td>Lt</td>
- <td>Titlecase_Letter</td>
- <td>a digraphic character, with first part uppercase</td>
- </tr>
- <tr class="lightblue">
- <td>LC</td>
- <td>Cased_Letter</td>
- <td>Lu | Ll | Lt</td>
- </tr>
- <tr>
- <td>Lm</td>
- <td>Modifier_Letter</td>
- <td>a modifier letter</td>
- </tr>
- <tr>
- <td>Lo</td>
- <td>Other_Letter</td>
- <td>other letters, including syllables and ideographs</td>
- </tr>
- <tr class="lightblue">
- <td>L</td>
- <td>Letter</td>
- <td>Lu | Ll | Lt | Lm | Lo</td>
- </tr>
- <tr>
- <td>Mn</td>
- <td>Nonspacing_Mark</td>
- <td>a nonspacing combining mark (zero advance width)</td>
- </tr>
- <tr>
- <td>Mc</td>
- <td>Spacing_Mark</td>
- <td>a spacing combining mark (positive advance width)</td>
- </tr>
- <tr>
- <td>Me</td>
- <td>Enclosing_Mark</td>
- <td>an enclosing combining mark</td>
- </tr>
- <tr class="lightblue">
- <td>M</td>
- <td>Mark</td>
- <td>Mn | Mc | Me</td>
- </tr>
- <tr>
- <td>Nd</td>
- <td>Decimal_Number</td>
- <td>a decimal digit</td>
- </tr>
- <tr>
- <td>Nl</td>
- <td>Letter_Number</td>
- <td>a letterlike numeric character</td>
- </tr>
- <tr>
- <td>No</td>
- <td>Other_Number</td>
- <td>a numeric character of other type</td>
- </tr>
- <tr class="lightblue">
- <td>N</td>
- <td>Number</td>
- <td>Nd | Nl | No</td>
- </tr>
- <tr>
- <td>Pc</td>
- <td>Connector_Punctuation</td>
- <td>a connecting punctuation mark, like a tie</td>
- </tr>
- <tr>
- <td>Pd</td>
- <td>Dash_Punctuation</td>
- <td>a dash or hyphen punctuation mark</td>
- </tr>
- <tr>
- <td>Ps</td>
- <td>Open_Punctuation</td>
- <td>an opening punctuation mark (of a pair)</td>
- </tr>
- <tr>
- <td>Pe</td>
- <td>Close_Punctuation</td>
- <td>a closing punctuation mark (of a pair)</td>
- </tr>
- <tr>
- <td>Pi</td>
- <td>Initial_Punctuation</td>
- <td>an initial quotation mark</td>
- </tr>
- <tr>
- <td>Pf</td>
- <td>Final_Punctuation</td>
- <td>a final quotation mark</td>
- </tr>
- <tr>
- <td>Po</td>
- <td>Other_Punctuation</td>
- <td>a punctuation mark of other type</td>
- </tr>
- <tr class="lightblue">
- <td>P</td>
- <td>Punctuation</td>
- <td>Pc | Pd | Ps | Pe | Pi | Pf | Po</td>
- </tr>
- <tr>
- <td>Sm</td>
- <td>Math_Symbol</td>
- <td>a symbol of mathematical use</td>
- </tr>
- <tr>
- <td>Sc</td>
- <td>Currency_Symbol</td>
- <td>a currency sign</td>
- </tr>
- <tr>
- <td>Sk</td>
- <td>Modifier_Symbol</td>
- <td>a non-letterlike modifier symbol</td>
- </tr>
- <tr>
- <td>So</td>
- <td>Other_Symbol</td>
- <td>a symbol of other type</td>
- </tr>
- <tr class="lightblue">
- <td>S</td>
- <td>Symbol</td>
- <td>Sm | Sc | Sk | So</td>
- </tr>
- <tr>
- <td>Zs</td>
- <td>Space_Separator</td>
- <td>a space character (of various non-zero widths)</td>
- </tr>
- <tr>
- <td>Zl</td>
- <td>Line_Separator</td>
- <td>U+2028 LINE SEPARATOR only</td>
- </tr>
- <tr>
- <td>Zp</td>
- <td>Paragraph_Separator</td>
- <td>U+2029 PARAGRAPH SEPARATOR only</td>
- </tr>
- <tr class="lightblue">
- <td>Z</td>
- <td>Separator</td>
- <td>Zs | Zl | Zp</td>
- </tr>
- <tr>
- <td>Cc</td>
- <td>Control</td>
- <td>a C0 or C1 control code</td>
- </tr>
- <tr>
- <td>Cf</td>
- <td>Format</td>
- <td>a format control character</td>
- </tr>
- <tr>
- <td>Cs</td>
- <td>Surrogate</td>
- <td>a surrogate code point</td>
- </tr>
- <tr>
- <td>Co</td>
- <td>Private_Use</td>
- <td>a private-use character</td>
- </tr>
- <tr>
- <td>Cn</td>
- <td>Unassigned</td>
- <td>a reserved unassigned code point or a noncharacter</td>
- </tr>
- <tr class="lightblue">
- <td>C</td>
- <td>Other</td>
- <td>Cc | Cf | Cs | Co | Cn</td>
- </tr>
- </table>
- </div>
-
- <p>Note that the value gc=Cn does not actually
- occur in UnicodeData.txt, because that data file does not list
- unassigned code points.</p>
-
- <p>The distinctions between some General_Category values
- are somewhat arbitrary for edge cases, particularly those involving
- symbols and punctuation. For example, a number of multiple-function
- ASCII characters, including "@", "#", "%", and "&amp;", have long
- been classified as Other_Punctuation (gc=Po), although they
- are not among the characters used as punctuation marks in traditional
- Western typography. Other characters may also be ambiguous between
- functioning to organize and delimit textual units (punctuation-like)
- or to represent concepts (symbol-like). Likewise, it may not always
- be clear whether some symbols are primarily used for mathematics
- or whether they are general symbols with occasional or even common use in mathematics.
- For example, many arrow symbols are classed as Other_Symbol,
- although they are widely used in mathematics. The
- General_Category values constitute a rough partitioning of characters
- to make distinctions for algorithmic processing, but do not
- provide a definitive classification for such overlapping
- or ambiguous usage of characters.</p>
-
- <p>Characters with the quotation-related General_Category values
- Pi or Pf may behave like opening punctuation (gc=Ps) or closing
- punctuation (gc=Pe), depending on usage and quotation conventions.</p>
-
- <p>General_Category values in the table highlighted
- in light blue (LC, L, M, N, P, S, Z, C) stand for groupings of related
- General_Category values. The classes they represent can be derived by
- unions of the relevant simple values, as shown in the table. The abbreviated
- and long value aliases for these classes are provided as a convenience
- for implementations, such as regex, which may wish to match more generic
- categories, such as "letter" or "number", rather than the detailed
- subtypes for General_Category. These aliases for groupings
- of General_Category values do not occur in UnicodeData.txt, which instead
- always specifies the enumerated subtype for the General_Category of a character.</p>
-
- <p>The symbol "L&amp;" is a label used to stand for any
- combination of uppercase, lowercase or titlecase letters
- (Lu, Ll, or Lt), in the first part of comments in the data files of the UCD.
- It is equivalent to gc=LC, but is only a label in comments, and is
- not expected to be used as an identifier for regular expression matching.</p>
-
- <p>The Unicode Standard does not assign nondefault property
- values to control characters (gc=Cc), except
- for certain well-defined exceptions involving the Unicode Bidirectional Algorithm,
- the Unicode Line Breaking Algorithm, and Unicode Text Segmentation.
- Also, implementations will usually assign
- behavior to certain line breaking control
- characters—most notably U+000D and U+000A (CR and LF)—according to platform conventions.
- See <i>Section 5.8, Newline Guidelines</i> in
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>] for more information.</p>
-
- <h4>5.7.2 <a name="Bidi_Class_Values" href="#Bidi_Class_Values">Bidirectional Class Values</a></h4>
-
- <p>The values in the Bidi_Class field in UnicodeData.txt
- make use of the short, abbreviated property value aliases
- for Bidi_Class. For convenience in reference, <i>Table 13</i>
- lists all the abbreviated and long value aliases for Bidi_Class values, reproduced from
- <a href="#PropertyValueAliases.txt">PropertyValueAliases.txt</a>,
- along with a brief description of each category.</p>
-
- <p class="caption">Table 13. <a name="BC_Values_Table" href="#BC_Values_Table">Bidi_Class Values</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Abbr</th>
- <th>Long</th>
- <th>Description</th>
- </tr>
- <tr class="lightblue">
- <td colspan="3" align="center">Strong Types</td>
- </tr>
- <tr>
- <td>L</td>
- <td>Left_To_Right</td>
- <td>any strong left-to-right character</td>
- </tr>
- <tr>
- <td>R</td>
- <td>Right_To_Left</td>
- <td>any strong right-to-left (non-Arabic-type) character</td>
- </tr>
- <tr>
- <td>AL</td>
- <td>Arabic_Letter</td>
- <td>any strong right-to-left (Arabic-type) character</td>
- </tr>
- <tr class="lightblue">
- <td colspan="3" align="center">Weak Types</td>
- </tr>
- <tr>
- <td>EN</td>
- <td>European_Number</td>
- <td>any ASCII digit or Eastern Arabic-Indic digit</td>
- </tr>
- <tr>
- <td>ES</td>
- <td>European_Separator</td>
- <td>plus and minus signs</td>
- </tr>
- <tr>
- <td>ET</td>
- <td>European_Terminator</td>
- <td>a terminator in a numeric format context, includes currency signs</td>
- </tr>
- <tr>
- <td>AN</td>
- <td>Arabic_Number</td>
- <td>any Arabic-Indic digit</td>
- </tr>
- <tr>
- <td>CS</td>
- <td>Common_Separator</td>
- <td>commas, colons, and slashes</td>
- </tr>
- <tr>
- <td>NSM</td>
- <td>Nonspacing_Mark</td>
- <td>any nonspacing mark</td>
- </tr>
- <tr>
- <td>BN</td>
- <td>Boundary_Neutral</td>
- <td>most format characters, control codes, or noncharacters</td>
- </tr>
- <tr class="lightblue">
- <td colspan="3" align="center">Neutral Types</td>
- </tr>
- <tr>
- <td>B</td>
- <td>Paragraph_Separator</td>
- <td>various newline characters</td>
- </tr>
- <tr>
- <td>S</td>
- <td>Segment_Separator</td>
- <td>various segment-related control codes</td>
- </tr>
- <tr>
- <td>WS</td>
- <td>White_Space</td>
- <td>spaces</td>
- </tr>
- <tr>
- <td>ON</td>
- <td>Other_Neutral</td>
- <td>most other symbols and punctuation marks</td>
- </tr>
- <tr class="lightblue">
- <td colspan="3" align="center">Explicit Formatting Types</td>
- </tr>
- <tr>
- <td>LRE</td>
- <td>Left_To_Right_Embedding</td>
- <td>U+202A: the LR embedding control</td>
- </tr>
- <tr>
- <td>LRO</td>
- <td>Left_To_Right_Override</td>
- <td>U+202D: the LR override control</td>
- </tr>
- <tr>
- <td>RLE</td>
- <td>Right_To_Left_Embedding</td>
- <td>U+202B: the RL embedding control</td>
- </tr>
- <tr>
- <td>RLO</td>
- <td>Right_To_Left_Override</td>
- <td>U+202E: the RL override control</td>
- </tr>
- <tr>
- <td>PDF</td>
- <td>Pop_Directional_Format</td>
- <td>U+202C: terminates an embedding or override control</td>
- </tr>
- <tr>
- <td>LRI</td>
- <td>Left_To_Right_Isolate</td>
- <td>U+2066: the LR isolate control</td>
- </tr>
- <tr>
- <td>RLI</td>
- <td>Right_To_Left_Isolate</td>
- <td>U+2067: the RL isolate control</td>
- </tr>
- <tr>
- <td>FSI</td>
- <td>First_Strong_Isolate</td>
- <td>U+2068: the first strong isolate control</td>
- </tr>
- <tr>
- <td>PDI</td>
- <td>Pop_Directional_Isolate</td>
- <td>U+2069: terminates an isolate control</td>
- </tr>
- </table>
- </div>
-
- <p>Please refer to Unicode Standard Annex #9, "Unicode Bidirectional Algorithm"
- [<a href="../tr41/tr41-21.html#UAX9">UAX9</a>] for
- an explanation of the significance
- of these values when formatting bidirectional text.</p>
-
- <p>The four enumerated values for the isolate controls were added
- in Unicode 6.3. That means there is a discontinuity in the enumeration for Bidi_Class
- between Unicode 6.2 and Unicode 6.3 (and later versions) which parsers of
- UnicodeData.txt and DerivedBidiClass.txt must take into account.</p>
-
- <h4>5.7.3 <a name="Character_Decomposition_Mappings" href="#Character_Decomposition_Mappings">Character Decomposition Mapping</a></h4>
- <p>The value of the Decomposition_Mapping property for a character is provided
- in field 5 of UnicodeData.txt. This is a string property, consisting of a sequence
- of one or more Unicode code points. The default value of the Decomposition_Mapping
- property is the code point of the character itself. The use of the default value
- for a character is indicated by leaving field 5 empty in UnicodeData.txt.
- Informally, the value of the Decomposition_Mapping property for a character
- is known simply as its <i>decomposition mapping</i>. When a character's decomposition
- mapping is other than the default value, the decomposition mapping is printed out
- explicitly in the names list for the Unicode code charts.</p>
-
- <p>The prefixed tags supplied with a subset of the decomposition mappings generally indicate formatting
- information. Where no such tag is given, the mapping is canonical. Conversely, the presence of a
- formatting tag also indicates that the mapping is a compatibility mapping and not a canonical
- mapping. In the absence of other formatting information in a compatibility mapping, the tag is
- used to distinguish it from canonical mappings.</p>
-
- <p>In some instances a canonical mapping or a compatibility mapping may consist of a single
- character. For a canonical mapping, this indicates that the character is a canonical equivalent of
- another single character. For a compatibility mapping, this indicates that the character is a
- compatibility equivalent of another single character.</p>
-
- <p>A canonical mapping may also consist of a pair of characters, but is never
- longer than two characters. When a canonical mapping consists of a pair of characters,
- the first character may itself be a character with a decomposition mapping, but the
- second character never has a decomposition mapping.</p>
-
- <p>Compatibility mappings can be much longer than canonical mappings. For historical reasons, the
- longest compatibility mapping is 18 characters long. Compatibility mappings are guaranteed
- to be no longer than 18 characters, although most consist of just a few characters.</p>
-
- <p>The compatibility formatting
- tags used in the UCD are listed in <i>Table 14</i>.</p>
-
- <p class="caption">Table 14. <a name="Formatting_Tags_Table" href="#Formatting_Tags_Table">Compatibility Formatting Tags</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Tag</th>
- <th>Description</th>
- </tr>
- <tr>
- <td>&lt;font&gt;</td>
- <td>Font variant (for example, a blackletter form)</td>
- </tr>
- <tr>
- <td>&lt;noBreak&gt;</td>
- <td>No-break version of a space or hyphen</td>
- </tr>
- <tr>
- <td>&lt;initial&gt;</td>
- <td>Initial presentation form (Arabic)</td>
- </tr>
- <tr>
- <td>&lt;medial&gt;</td>
- <td>Medial presentation form (Arabic)</td>
- </tr>
- <tr>
- <td>&lt;final&gt;</td>
- <td>Final presentation form (Arabic)</td>
- </tr>
- <tr>
- <td>&lt;isolated&gt;</td>
- <td>Isolated presentation form (Arabic)</td>
- </tr>
- <tr>
- <td>&lt;circle&gt;</td>
- <td>Encircled form</td>
- </tr>
- <tr>
- <td>&lt;super&gt;</td>
- <td>Superscript form</td>
- </tr>
- <tr>
- <td>&lt;sub&gt;</td>
- <td>Subscript form</td>
- </tr>
- <tr>
- <td>&lt;vertical&gt;</td>
- <td>Vertical layout presentation form</td>
- </tr>
- <tr>
- <td>&lt;wide&gt;</td>
- <td>Wide (or zenkaku) compatibility character</td>
- </tr>
- <tr>
- <td>&lt;narrow&gt;</td>
- <td>Narrow (or hankaku) compatibility character</td>
- </tr>
- <tr>
- <td>&lt;small&gt;</td>
- <td>Small variant form (CNS compatibility)</td>
- </tr>
- <tr>
- <td>&lt;square&gt;</td>
- <td>CJK squared font variant</td>
- </tr>
- <tr>
- <td>&lt;fraction&gt;</td>
- <td>Vulgar fraction form</td>
- </tr>
- <tr>
- <td>&lt;compat&gt;</td>
- <td>Otherwise unspecified compatibility character</td>
- </tr>
- </table>
- </div>
-
- <p><b>Note: </b>There is a difference between decomposition and the
- Decomposition_Mapping property. The
- Decomposition_Mapping property is a string property whose
- values (mappings) are defined in UnicodeData.txt, while the decomposition (also termed "full
- decomposition") is defined in <i>Section 3.7, Decomposition</i> in
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>] to use those mappings <i>recursively.</i></p>
-
- <ul>
- <li>The canonical decomposition is formed by recursively applying the canonical mappings, then
- applying the Canonical Ordering Algorithm.</li>
- <li>The compatibility decomposition is formed by recursively applying the canonical <b>and</b>
- compatibility mappings, then applying the Canonical Ordering Algorithm.</li>
- </ul>
-
- <p>Starting from Unicode 2.1.9, the decomposition mappings in
- <a href="#UnicodeData.txt">UnicodeData.txt</a> can be used to derive the
- full decomposition of any single character in canonical order, without
- the need to separately apply the Canonical Ordering Algorithm.
- However, canonical ordering of combining character sequences <b><i>must</i></b> still be applied
- in decomposition when normalizing source text which contains any combining marks.</p>
-
- <p>The normalization of Hangul conjoining jamos and of Hangul syllables depends on algorithmic
- mapping, as specified in <i>Section 3.12, Conjoining Jamo Behavior</i> in
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- That algorithm specifies the full decomposition of all precomposed Hangul syllables, but
- effectively it is equivalent to the recursive application of pairwise decomposition
- mappings, as for all other Unicode characters. Formally, the Decomposition_Mapping
- property value for a Hangul syllable is the pairwise decomposition and not the full
- decomposition.</p>
-
- <p>Each character with the <a href="#Hangul_Syllable_Type">Hangul_Syllable_Type</a>
- value LVT will have a Decomposition_Mapping consisting of a character with an LV value and a
- character with a T value. Thus for U+CE31 the Decomposition_Mapping is &lt;U+CE20, U+11B8&gt;,
- rather than &lt;U+110E, U+1173, U+11B8&gt;.</p>
-
- <p>The Unihan property kCompatibilityVariant consists of a listing of the
- canonical Decomposition_Mapping property values just for CJK compatibility ideographs. Because its values are
- derived from UnicodeData.txt, it is formally considered to be a derived property. The exact statement
- of the derivation for kCompatibilityVariant is listed in Unicode Standard Annex #38, "Unicode Han Database (Unihan)"
- [<a href="../tr41/tr41-21.html#UAX38">UAX38</a>].</p>
-
- <h4>5.7.4 <a name="Canonical_Combining_Class_Values" href="#Canonical_Combining_Class_Values">Canonical Combining Class Values</a></h4>
-
- <p>The values in the Canonical_Combining_Class field in UnicodeData.txt
- are numerical values used in the Canonical Ordering Algorithm. Some of
- those numerical values also have explicit symbolic labels as property
- value aliases, to make their intended application more understandable.
- For convenience in reference, <i>Table 15</i>
- lists the long symbolic aliases for Canonical_Combining_Class values, reproduced from
- <a href="#PropertyValueAliases.txt">PropertyValueAliases.txt</a>,
- along with a brief description of each category. The listing for
- fixed position classes, with long symbolic aliases of the form "Ccc10", and so forth, is
- abbreviated, as when those labels occur they are predictable in form, based on the numeric values.</p>
- <p class="caption">Table 15. <a name="CCC_Values_Table" href="#CCC_Values_Table">Canonical_Combining_Class Values</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Value</th>
- <th>Long</th>
- <th>Description</th>
- </tr>
- <tr>
- <td>0</td>
- <td>Not_Reordered</td>
- <td>Spacing and enclosing marks; also many vowel and consonant signs, even if nonspacing</td>
- </tr>
- <tr>
- <td>1</td>
- <td>Overlay</td>
- <td>Marks which overlay a base letter or symbol</td>
- </tr>
- <tr>
- <td>7</td>
- <td>Nukta</td>
- <td>Diacritic nukta marks in Brahmi-derived scripts</td>
- </tr>
- <tr>
- <td>8</td>
- <td>Kana_Voicing</td>
- <td>Hiragana/Katakana voicing marks</td>
- </tr>
- <tr>
- <td>9</td>
- <td>Virama</td>
- <td>Viramas</td>
- </tr>
- <tr>
- <td>10</td>
- <td>Ccc10</td>
- <td>Start of fixed position classes</td>
- </tr>
- <tr>
- <td>...</td>
- <td>...</td>
- <td> </td>
- </tr>
- <tr>
- <td>199</td>
- <td> </td>
- <td>End of fixed position classes</td>
- </tr>
- <tr>
- <td>200</td>
- <td>Attached_Below_Left</td>
- <td>Marks attached at the bottom left</td>
- </tr>
- <tr>
- <td>202</td>
- <td>Attached_Below</td>
- <td>Marks attached directly below</td>
- </tr>
- <tr>
- <td>204</td>
- <td> </td>
- <td>Marks attached at the bottom right</td>
- </tr>
- <tr>
- <td>208</td>
- <td> </td>
- <td>Marks attached to the left</td>
- </tr>
- <tr>
- <td>210</td>
- <td> </td>
- <td>Marks attached to the right</td>
- </tr>
- <tr>
- <td>212</td>
- <td> </td>
- <td>Marks attached at the top left</td>
- </tr>
- <tr>
- <td>214</td>
- <td>Attached_Above</td>
- <td>Marks attached directly above</td>
- </tr>
- <tr>
- <td>216</td>
- <td>Attached_Above_Right</td>
- <td>Marks attached at the top right</td>
- </tr>
- <tr>
- <td>218</td>
- <td>Below_Left</td>
- <td>Distinct marks at the bottom left</td>
- </tr>
- <tr>
- <td>220</td>
- <td>Below</td>
- <td>Distinct marks directly below</td>
- </tr>
- <tr>
- <td>222</td>
- <td>Below_Right</td>
- <td>Distinct marks at the bottom right</td>
- </tr>
- <tr>
- <td>224</td>
- <td>Left</td>
- <td>Distinct marks to the left</td>
- </tr>
- <tr>
- <td>226</td>
- <td>Right</td>
- <td>Distinct marks to the right</td>
- </tr>
- <tr>
- <td>228</td>
- <td>Above_Left</td>
- <td>Distinct marks at the top left</td>
- </tr>
- <tr>
- <td>230</td>
- <td>Above</td>
- <td>Distinct marks directly above</td>
- </tr>
- <tr>
- <td>232</td>
- <td>Above_Right</td>
- <td>Distinct marks at the top right</td>
- </tr>
- <tr>
- <td>233</td>
- <td>Double_Below</td>
- <td>Distinct marks subtending two bases</td>
- </tr>
- <tr>
- <td>234</td>
- <td>Double_Above</td>
- <td>Distinct marks extending above two bases</td>
- </tr>
- <tr>
- <td>240</td>
- <td>Iota_Subscript</td>
- <td>Greek iota subscript only</td>
- </tr>
- </table>
- </div>
-
- <p>Some of the Canonical_Combining_Class values in the table are not currently used
- for any characters but are specified here for completeness. Some
- values do not have long symbolic aliases and are not listed in PropertyValueAliases.txt.
- Do not assume that absence of a long symbolic alias implies
- non-use of a particular Canonical_Combining_Class. See
- <a href="#DerivedCombiningClass.txt">DerivedCombiningClass.txt</a> for
- a complete listing of the use of Canonical_Combining_Class values for
- any particular version of the UCD.</p>
-
- <p>For use in regular expression matching, fixed position classes (ccc=10 through
- ccc=199) which actually occur in the Unicode Character Database for any version are
- given predictable aliases of the form "Ccc10", "Ccc11", and so forth. The complete list of such aliases which
- are actually defined can be found in PropertyValueAliases.txt.</p>
-
- <p>The character property invariants regarding Canonical_Combining_Class
- guarantee that values, once assigned, will never change, and
- that all values used will be in the range 0..254. See
- <a href="#Invariants_in_Implementations">Invariants in Implementations</a>.</p>
-
- <p>Combining marks with ccc=224 (Left) follow their base character in storage,
- as for all combining marks, but are rendered visually on the left
- side of that base character. For all past versions of the UCD and
- continuing with this version of the UCD, only two
- tone marks used in certain notations for Hangul syllables have ccc=224.
- Those marks are actually rendered visually on the left side of
- the preceding <i>grapheme cluster</i>, in the case of Hangul syllables
- resulting from sequences of conjoining jamos.</p>
-
- <p>Those few instances of combining marks with ccc=Left should be
- distinguished from the far more numerous examples of left-side vowel
- signs and vowel letters in Brahmi-derived scripts.
- The Canonical_Combining_Class value is zero (Not_Reordered) for both
- ordinary, left-side (reordrant) vowel signs such as
- U+093F DEVANAGARI VOWEL SIGN I and for Thai-style left-side
- (Logical_Order_Exception=Yes) vowel letters such as U+0E40
- THAI CHARACTER SARA E. The "Not_Reordered" of ccc=Not_Reordered
- refers to the behavior of the character in terms of the Canonical
- Ordering Algorithm as part of the definition of Unicode Normalization;
- it does <i>not</i> refer to any issues of visual reordering of glyphs
- involved in display and rendering. See "Canonical Ordering
- Algorithm" in <i>Section 3.11,
- Normalization Forms</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</p>
-
-
- <h4>5.7.5 <a name="Decompositions_and_Normalization" href="#Decompositions_and_Normalization">Decompositions and Normalization</a></h4>
-
- <p>Decomposition is specified in <i>Chapter 3, Conformance</i> of
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].
- That chapter also
- specifies the interaction between decomposition and normalization.</p>
-
- <p>A number of derived properties related to Unicode normalization are called
- the "Quick_Check" properties. These are defined to enable various optimizations
- for implementations of normalization, as explained in
- <i>Section 9, Detecting Normalization Forms</i>, in Unicode Standard Annex #15, "Unicode Normalization Forms"
- [<a href="../tr41/tr41-21.html#UAX15">UAX15</a>].
- The values for the four Quick_Check properties for all code points are listed in
- DerivedNormalizationProps.txt. The interpretations of the possible property values
- are summarized in <i>Table 16</i>.</p>
-
- <p class="caption">Table 16. <a name="QC_Values_Table" href="#QC_Values_Table">Quick_Check Property Values</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Property</th>
- <th>Value</th>
- <th>Description</th>
- </tr>
- <tr>
- <td>NFC_QC, NFKC_QC, NFD_QC, NFKD_QC</td>
- <td>No</td>
- <td>Characters that cannot ever occur in the respective normalization form.</td>
- </tr>
- <tr>
- <td>NFC_QC, NFKC_QC</td>
- <td>Maybe</td>
- <td>Characters that may occur in the respective normalization form, depending on the context.</td>
- </tr>
- <tr>
- <td>NFC_QC, NFKC_QC, NFD_QC, NFKD_QC</td>
- <td>Yes</td>
- <td>All other characters. This is the default value for Quick_Check properties.</td>
- </tr>
- </table>
- </div>
- <p>The Quick_Check property values are recommended for exposure in a public library API
- which supports Unicode character properties, because they can be used to optimize
- code that needs to normalize Unicode strings. They enable fast checking of whether
- some input strings are already in the desired normalization form. This may make
- it possible to bypass
- the more time-consuming call to run the complete Unicode Normalization Algorithm
- on the input string.</p>
- <p>In contrast, some normalization-related Unicode character properties
- are <i>not</i> recommended for exposure in a public library API. Notably, these include
- <a href="#Decomposition_Mapping">Decomposition_Mapping</a>,
- <a href="#Composition_Exclusion">Composition_Exclusion</a>,
- and the derived <a href="#Full_Composition_Exclusion">Full_Composition_Exclusion</a>.
- These properties are only used internally in a conformant implementation of
- the Unicode Normalization Algorithm. Exposing them in a public API can lead
- to confusion by users of the API. In particular, Decomposition_Mapping is very
- easy to misinterpret as designating the <i>decomposition</i> of a character,
- also known as the character's <i>full decomposition</i>. See Definitions D62 and D64
- in <i>Section 3.7, Decomposition</i> in [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</p>
-
- <h4>5.7.6 <a name="Property_Values_As_Sets" href="#Property_Values_As_Sets">Properties Whose Values Are Sets of Values</a></h4>
-
- <p>Most properties have a single value associated with each code point.
- However, some properties may instead associate a set of multiple
- different values with each code point. For example, the provisional
- kCantonese property, which lists Cantonese pronunciations
- for unified CJK ideographs, has values which consist of a set of
- zero or more romanized pronunciation strings. Thus, the Unihan
- Database contains an entry:</p>
- <blockquote>
- <pre>
- U+342B kCantonese gun3 hung1 zung1
- </pre>
- </blockquote>
- <p>This line is to be interpreted as associating a set of three string values,
- {"gun3", "hung1", "zung1"} with the kCantonese property for U+342B.</p>
- <p>Similarly, the Script_Extensions property has values which
- consist of a set of one or more Script property values. Thus the
- property file ScriptExtensions.txt in the UCD contains an entry:</p>
- <blockquote>
- <pre>
- 0640 ; Adlm Arab Mand Mani Phlp Syrc # Lm ARABIC TATWEEL
- </pre>
- </blockquote>
- <p>This line is to be interpreted as associating a set of six enumerated
- Script property values, {Adlm, Arab, Mand, Mani, Phlp, Syrc}, with the Script_Extensions
- property for U+0640.</p>
- <p>In the case of Script_Extensions, in particular, the set of sets which
- constitute meaningful values of the property is relatively small, and could be explicitly
- evaluated for any particular Unicode version. For example:</p>
- <blockquote>
- <pre>
- {{Adlm, Arab, Mand, Mani, Phlp, Syrc}, {Arab, Copt}, {Arab, Syrc}, {Arab, Thaa}, {Arab, Syrc, Thaa}, {Armn, Geor}, ...}
- </pre>
- </blockquote>
- <p>However, an enumeration of this set of set values is unlikely to be
- of much implementation value, and would be likely to change significantly between
- versions of the standard. In other cases, such as for properties defining pronunciation
- readings for unified CJK ideographs, these sets of sets are completely open-ended, and there
- is no point to attempting to provide explicit enumerations of such sets in the UCD.</p>
- <p>The order of the element values in such sets may or may not be significant.
- For example, the order among the element values for kCantonese and for
- Script_Extensions is not significant. By way of contrast, when the kMandarin
- property shows two values for a code point, the first value is used to
- indicate a preferred pronunciation for zh-Hans (CN) and the second a
- preferred pronunciation for zh-Hant (TW).</p>
- <p>For data file format considerations regarding properties which take
- sets of values, see Section 4.2.8 <a href="#Multiple_Values">Multiple Values for Properties</a>.
- For considerations regarding validation of such
- properties, see Section 5.11.5 <a href="#Validation_of_Multivalued">Validation of Multivalued Properties</a>.
- See also Unicode Technical Standard #18, "Unicode Regular Expressions"
- [<a href="../tr41/tr41-21.html#UTS18">UTS18</a>] for a discussion of how to handle
- such properties when processing regular expressions.</p>
- <h3>5.8 <a name="Property_And_Value_Aliases" href="#Property_And_Value_Aliases">Property and Property Value Aliases</a></h3>
- <p>Both Unicode character properties themselves and their values are
- given symbolic aliases. The formal lists of aliases are provided so that
- well-defined symbolic values are available for XML formats of the UCD
- data, for regular expression property tests, and for other
- programmatic textual descriptions of Unicode data.
- The aliases for properties are defined in
- PropertyAliases.txt. The aliases for property values are defined in
- PropertyValueAliases.txt.</p>
-
- <p class="caption">Table 17. <a name="Alias_Files_Table" href="#Alias_Files_Table">Alias Files in the UCD</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>File Name</th>
- <th>Status</th>
- <th>Description</th>
- </tr>
- <tr>
- <td><a name="PropertyAliases.txt" href="#PropertyAliases.txt">PropertyAliases.txt</a></td>
- <td>N</td>
- <td>Names and abbreviations for properties</td>
- </tr>
- <tr>
- <td><a name="PropertyValueAliases.txt" href="#PropertyValueAliases.txt">PropertyValueAliases.txt</a></td>
- <td>N</td>
- <td>Names and abbreviations for property values</td>
- </tr>
- </table>
- </div>
-
- <p>Aliases are defined as ASCII-compatible identifiers, using only uppercase or
- lowercase A-Z, digits, and underscore "_". Case is not significant
- when comparing aliases, but the preferred form used in the data files
- for longer aliases is to titlecase them.</p>
-
- <p>Aliases may be translated in appropriate environments, and additional
- aliases may be useful in certain contexts. There is no requirement that
- only the aliases defined in the alias files of the UCD be used when
- referring to Unicode character properties or their values; however, their
- use is recommended for interoperability in data formats or in
- programmatic contexts.</p>
-
- <p>Aliases may be provided
- for provisional properties. There are stability guarantees for property aliases and property
- value aliases, but no stability guarantees for provisional properties or other
- provisional data files; consequently, there can also be
- no stability guarantee for property aliases or property value aliases associated with provisional
- properties.</p>
-
- <h4>5.8.1 <a name="Property_Aliases" href="#Property_Aliases">Property Aliases</a></h4>
-
- <p>In PropertyAliases.txt, the first field specifies an abbreviated
- symbolic name for the property, and the second field specifies the
- long symbolic name for the property. These are the preferred aliases.
- Additional aliases for a few properties are specified in the third
- or subsequent fields.</p>
-
- <p>Aliases for normative and informative
- properties defined in the Unihan data files are included in PropertyAliases.txt,
- beginning with Version 5.2.</p>
-
- <p>The long symbolic name alias is self-descriptive, and is
- treated as the official name of
- a Unicode character property. For clarity it is used whenever possible
- when referring to that
- property in this annex and elsewhere in the Unicode Standard.
- For example: "The Line_Break property is discussed in Unicode Standard Annex #14, "Unicode Line
- Breaking Algorithm" [<a href="../tr41/tr41-21.html#UAX14">UAX14</a>]."</p>
-
- <p>The abbreviated symbolic name alias is short and less mnemonic,
- but is useful for expressions such as "lb=BA" in data or in other
- contexts where the meaning is clear.</p>
-
- <p>The property aliases specified in PropertyAliases.txt constitute
- a unique namespace. When using these symbolic values, no
- alias for one property will match an alias for another property.</p>
-
- <h4>5.8.2 <a name="Property_Value_Aliases" href="#Property_Value_Aliases">Property Value Aliases</a></h4>
-
- <p>In PropertyValueAliases.txt, the first field contains the
- abbreviated alias for a Unicode property, the second field specifies
- an abbreviated symbolic name for a value of that property, and
- the third field specifies the
- long symbolic name for that value of that property. These are the
- preferred aliases.
- Additional aliases for some property values may be specified in the fourth
- or subsequent fields. For example, for binary properties, the
- abbreviated alias for the True value is "Y", and the long alias
- is "Yes", but each entry also specifies "T" and "True" as
- additional aliases for that value, as shown in <i>Table 18</i>.</p>
-
- <p class="caption">Table 18. <a name="Binary_Values_Table" href="#Binary_Values_Table">Binary Property Value Aliases</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Long</th>
- <th>Abbreviated</th>
- <th>Other Aliases</th>
- </tr>
- <tr>
- <td style="text-align:center">Yes</td>
- <td style="text-align:center">Y</td>
- <td style="text-align:center">True, T</td>
- </tr>
- <tr>
- <td style="text-align:center">No</td>
- <td style="text-align:center">N</td>
- <td style="text-align:center">False, F</td>
- </tr>
- </table>
-
- </div>
- <p>Not every property value has an associated alias. Property value
- aliases are typically supplied for catalog and enumeration
- properties, which have well-defined, enumerated values. It does not
- make sense to specify property value aliases, for example, for
- the Numeric_Value property, whose value could be any number, or
- for a string property such as Simple_Lowercase_Mapping, whose values
- are mappings from one code point to another.</p>
-
- <p>The Canonical_Combining_Class property requires special handling
- in PropertyValueAliases.txt. The values of this property are numeric,
- but they comprise a closed, enumerated set of values. The more
- important of those values are given symbolic name aliases.
- In PropertyValueAliases.txt, the second field provides the numeric
- value, while the third field contains the abbreviated symbolic
- name alias and the fourth field contains the long symbolic
- name alias for that numeric value. For example:</p>
-
- <blockquote>
- <pre>
- ccc; 230; A ; Above
- ccc; 232; AR ; Above_Right
- </pre>
- </blockquote>
-
- <p>Taken by themselves, property value aliases do not constitute
- a unique namespace. The abbreviated aliases, in particular,
- are often re-used as aliases for values for different properties.
- All of the binary property value aliases, for example, make
- use of the same "Y", "Yes", "T", "True" symbols. Property value
- aliases may also overlap the symbols used for property aliases.
- For example, "Sc" is the abbreviated alias for the
- "Currency_Symbol" value of the General_Category property, but
- it is also the abbreviated alias for the Script property.
- However, the aliases for values for any single property are
- always unique within the context of that property. That
- means that expressions that combine a property alias and
- a property value alias, such as "lb=BA" or "gc=Sc" <i>always</i>
- refer unambiguously just to one value of one given property,
- and will not match any other value of any other property.</p>
-
- <p>Prior to Version 6.1.0, the property value alias entries for three properties,
- Age, Block, and Joining_Group, made use of a special metavalue
- "n/a" in the field for the abbreviated alias. This should
- be understood as meaning that no abbreviated alias was
- defined for that value for that property, rather than as
- an alias per se. Starting with Version 6.1.0, all property values for those
- three properties have abbreviated aliases, so there is no current use of the "n/a" metavalue.</p>
-
- <p>In a few cases, because of longstanding legacy practice
- in referring to values of a property by short identifiers,
- the abbreviated alias and the long alias are the same. This
- can be seen, for example, in some property value aliases
- for the Line_Break property and the Grapheme_Cluster_Break
- property.</p>
-
- <p>The property <a href="#Script_Extensions">Script_Extensions</a>
- consists of enumerated sets of Script property values. The set of those sets is potentially
- open-ended, and no property value aliases are defined for them.</p>
-
- <h3>5.9 <a name="Matching_Rules" href="#Matching_Rules">Matching Rules</a></h3>
- <p>When matching Unicode character property names
- and values, it is strongly recommended that all
- <a href="#Property_And_Value_Aliases">Property and Property Value Aliases</a>
- be recognized. For best results in matching, rather than using
- exact binary comparisons, the following loose matching rules
- should be observed.</p>
- <h4>5.9.1 <a name="Matching_Numeric" href="#Matching_Numeric">Matching Numeric Property Values</a></h4>
- <p>For all numeric properties, and for properties such as Unicode_Radical_Stroke
- which are constructed from combinations
- of numeric values, use loose matching rule UAX44-LM1 when comparing property values.</p>
-
- <p><i><b><a name="UAX44-LM1" href="#UAX44-LM1">UAX44-LM1</a>.</b></i> Apply numeric equivalences.</p>
- <ul>
- <li>"01.00" is equivalent to "1".</li>
- <li>"1.666667" in the UCD is a repeating fraction, and
- equivalent to "10/6" or "5/3".</li>
- </ul>
-
- <h4>5.9.2 <a name="Matching_Names" href="#Matching_Names">Matching Character Names</a></h4>
- <p>Unicode character names constitute a special case. Formally, they are values
- of the Name property. While each Unicode character name for an assigned character
- is guaranteed to be unique, names are assigned in such a way that
- the presence or absence of spaces cannot be used to distinguish them.
- Furthermore, implementations sometimes create identifiers from Unicode
- character names by inserting underscores for spaces. For best results
- in comparing Unicode character names, use loose matching rule UAX44-LM2.</p>
-
- <p><i><b><a name="UAX44-LM2" href="#UAX44-LM2">UAX44-LM2</a>.</b></i> Ignore case, whitespace, underscore ('_'), and all medial hyphens except the hyphen in
- U+1180 HANGUL JUNGSEONG O-E.</p>
- <ul>
- <li>"zero-width space" is equivalent to "ZERO WIDTH SPACE" or "zerowidthspace"</li>
- <li>"character -a" is <i>not</i> equivalent to "character a"</li>
- </ul>
-
- <p>In this rule "medial hyphen" is to be construed as a hyphen
- occurring immediately between two letters in the normative Unicode character
- name, as published in the Unicode names list, and not as any hyphen that may
- transiently occur medially as a result of removing whitespace before removing hyphens in
- a particular implementation of matching. Thus the hyphen in the name
- U+10089 LINEAR B IDEOGRAM B107M HE-GOAT is medial, and should be ignored
- in loose matching, but the hyphen in the name U+0F39 TIBETAN MARK TSA -PHRU is
- <i>not</i> medial, and should not be ignored in loose matching.</p>
-
- <p>An implementation of this loose matching rule can obtain
- the correct results when comparing two strings by doing the following three
- operations, in order:</p>
-
- <ol>
- <li>remove all medial hyphens (except the medial hyphen in the name for U+1180)</li>
- <li>remove all whitespace and underscore characters</li>
- <li>apply toLowercase() to both strings</li>
- </ol>
-
- <p>After applying these three operations, if the two strings
- compare binary equal, then they are considered to match.</p>
-
- <p>This is a logical statement of how the rule works. If programmed
- carefully, an implementation of the matching rule can transform the strings in
- a single pass. It is also possible to compare two name strings for loose matching
- while transforming each string incrementally.</p>
-
- <p>Loose matching rule UAX44-LM2 is also appropriate for matching
- character name aliases and the names of named character sequences, which share the
- namespace (and matching behavior) of Unicode character names. See <i>Section 4.8, Name</i> in
- [<a href="../tr41/tr41-21.html#Unicode">Unicode</a>].</p>
-
- <p>Implementations of name matching should use extreme care when matching
- non-standard, alternative names for particular characters. The Name Uniqueness Policy
- in the Unicode Consortium Stability
- Policies [<a href="../tr41/tr41-21.html#Stability">Stability</a>] guarantees that
- the Unicode Standard will never add a character whose name would match an existing
- encoded character, according to matching rule UAX44-LM2. However, any <i>other</i>
- name for a character might be used in the future.</p>
-
- <p>The following is a concrete example of the kind of trouble that can occur.
- Prior to Unicode 6.0 some implementations of regex allowed matching of the name "BELL" for
- the control code U+0007. When Unicode 6.0 added a <i>different</i> encoded character,
- U+1F514 BELL for emoji symbols, those regex implementations broke.</p>
-
- <p>As of Version 6.1 of the Unicode Standard, the most commonly occurring
- alternative names for control codes, as well as many commonly used abbreviations for
- Unicode format characters, have been added as character name aliases. This automatically
- excludes all such alternative names and abbreviations from the potential pool for
- future Unicode character names, because name uniqueness is defined over the namespace
- which includes both character names and character name aliases. That exclusion should
- reduce the potential for surprises similar to the "BELL" case, where implementers
- assume that a name for a control code is already well-defined.</p>
-
- <h4>5.9.3 <a name="Matching_Symbolic" href="#Matching_Symbolic">Matching Symbolic Values</a></h4>
- <p>Property aliases and property value aliases are symbolic values. When
- comparing them, use loose matching rule UAX44-LM3.</p>
-
- <p><i><b><a name="UAX44-LM3" href="#UAX44-LM3">UAX44-LM3</a>.</b></i> Ignore case, whitespace, underscore ('_'),
- hyphens, and any initial prefix string "is".</p>
- <ul>
- <li>"linebreak" is equivalent to "Line_Break" or "Line-break"</li>
- <li>"lb=BA" is equivalent to "lb=ba" or "LB=BA"</li>
- <li>"Script=Greek" is equivalent to "Script=isGreek" or "Script=Is_Greek"</li>
- </ul>
- <p>Loose matching is generally appropriate for the property values of
- Catalog, Enumeration, and Binary properties, which have symbolic aliases
- defined for their values.
- Loose matching should not be done for the property values of String properties,
- which do not have symbolic aliases defined for their values; exact
- matching for String property values is important, as
- case distinctions or other distinctions in those values may be significant.</p>
-
- <p>For loose matching of symbolic values, an initial prefix string "is" is
- ignored. The reason for this is that APIs returning property values are often
- named using the convention of prefixing "is" (or "Is" or "Is_", and so forth) to
- a property value. Ignoring any initial "is" on a symbolic value during loose
- matching is likely to produce the best results in application areas such as
- regex. Removal of an initial "is" string for a loose matching comparison only
- needs to be done once for a symbolic value, and need not be tested recursively.
- There are no property aliases or property value aliases of the form
- "isisisisistooconvoluted" defined just to test implementation edge cases.</p>
-
- <p>Existing and future property aliases and property value
- aliases are guaranteed to be unique within their relevant namespaces, even
- if an initial prefix string "is" is ignored. The existing cases of note
- for aliases that do start with "is" are: dt=Iso (Decomposition_Type=Isolated)
- and lb=IS. The Decomposition_Type value alias does not cause any problem,
- because there is no contrasting value alias dt=o (Decomposition_Type=olated).
- For lb=IS, note that the "IS" is the <i>entire</i> property value alias, and
- is not a prefix. There is no null value for the Line_Break property for it
- to contrast with, but implementations of loose matching should be careful
- of this edge case, so that "lb=IS" is not misinterpreted as matching a null
- value.</p>
-
- <p>Implementations sometimes use other syntactic constructs
- that interact with loose matching. For example, the property matching
- expression \p{L} may be defaulted to refer to the Unicode General_Category
- property: \p{General_Category=L}. For more information about
- the use of property values in regular expressions and other environments,
- see <i>Section 1.2, Properties</i>, in Unicode Technical Standard #18,
- "Unicode Regular Expressions" [<a href="../tr41/tr41-21.html#UTS18">UTS18</a>].</p>
- <h3>5.10 <a name="Invariants" href="#Invariants">Invariants</a></h3>
- <p>Property values in the UCD may be subject to correction
- in subsequent versions of the standard, as errors are found. Furthermore, any
- new version of the Unicode Standard may introduce new property values for
- a given property, except where the set of allowable values is fixed
- by the property type (such as for binary properties), or where the
- set of allowable values is subject to a provision of the Unicode
- Character Encoding Stability Policy [<a href="../tr41/tr41-21.html#Stability">Stability</a>].
- Finally, a new version may also
- introduce new properties or new data files in the UCD.</p>
-
- <p>Implementers of the UCD need to be aware of
- such changes when updating to new versions. However, some property values
- and some aspects of the file formats are considered
- invariant. This section documents such invariants.</p>
-
- <h4>5.10.1 <a name="Property_Invariants" href="#Property_Invariants">Character Property Invariants</a></h4>
-
- <p>All formally guaranteed invariants for properties or property values
- are described in
- the Unicode Character Encoding Stability Policy
- [<a href="../tr41/tr41-21.html#Stability">Stability</a>].
- That policy and the list of invariants it enumerates are
- maintained outside the context of the Unicode Standard per se.
- They are not part of the standard, but rather are constraints
- on what can and cannot change in the standard between versions,
- and on what decisions the Unicode Technical Committee can and
- cannot take regarding the standard.</p>
-
- <p>In addition to the formally guaranteed invariants described
- in the Unicode Character Encoding Stability Policy, this section
- notes a few additional points regarding character property
- invariants in the UCD.</p>
-
- <p>Some character properties are simply considered <i>immutable</i>: once
- assigned, they are never changed. For example, a character's name
- is immutable, because of its importance in exact identification
- of the character. The Canonical_Combining_Class and
- Decomposition_Mapping of a character are immutable, because of their
- importance to the stability of the Unicode Normalization Algorithm
- [<a href="../tr41/tr41-21.html#UAX15">UAX15</a>].</p>
-
- <p>The list of immutable character properties is shown in
- <i>Table 19</i>.</p>
-
- <p class="caption">Table 19. <a name="Immutable_Properties_Table" href="#Immutable_Properties_Table">Immutable Properties</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Property Name</th>
- <th>Abbr Name</th>
- <th>Default Value</th>
- <th>Assignable to New?</th>
- </tr>
- <tr>
- <td>Age</td>
- <td>Age</td>
- <td>Unassigned</td>
- <td>Yes</td>
- </tr>
- <tr>
- <td>Name</td>
- <td>na</td>
- <td>null string</td>
- <td>Yes</td>
- </tr>
- <tr>
- <td>Name_Alias</td>
- <td>Name_Alias</td>
- <td>null string</td>
- <td>Yes (see note)</td>
- </tr>
- <tr>
- <td>Jamo_Short_Name</td>
- <td>jsn</td>
- <td>null string</td>
- <td>No</td>
- </tr>
- <tr>
- <td>Canonical_Combining_Class</td>
- <td>ccc</td>
- <td>0</td>
- <td>Yes</td>
- </tr>
- <tr>
- <td>Decomposition_Mapping</td>
- <td>dm</td>
- <td>&lt;code point&gt;</td>
- <td>Yes</td>
- </tr>
- <tr>
- <td>Pattern_Syntax</td>
- <td>Pat_Syn</td>
- <td>No</td>
- <td>No</td>
- </tr>
- <tr>
- <td>Pattern_White_Space</td>
- <td>Pat_WS</td>
- <td>No</td>
- <td>No</td>
- </tr>
- <tr>
- <td>Noncharacter_Code_Point</td>
- <td>NChar</td>
- <td>No</td>
- <td>No</td>
- </tr>
- </table>
- </div>
- <p>If a property has "Yes" in the "Assignable to New?" column
- in <i>Table 19</i>, that means that the property value is immutable once
- it is initially assigned to a newly encoded character. The value for a
- reserved code point takes the default value, as shown
- in the third column of the table, but <i>may change</i> from the default value
- once the character is encoded. On the other hand, if a property has "No"
- in the "Assignable to New?" column, that means that it is <i>absolutely</i>
- immutable: all code points, including reserved code points, have a specific
- property value assigned, and that value does not change if a new character
- is encoded at a particular reserved code point in a future version of the
- standard.</p>
- <p>The Name_Alias property is unusual, in that there can be more
- than one formal name alias assigned to a given encoded character. The default
- value for Name_Alias is the null string, but once any Name_Alias is assigned
- to an encoded character, that value is immutable. If more than one formal
- name alias is assigned to the same encoded character, each of those values is
- immutable.</p>
- <p>A set of binary character properties associated with identifiers have
- a different kind of immutability, which can be described as <i>locked to Yes</i>.
- This results from the way these properties are used in the specification of identifiers.
- Unicode identifiers have the characteristic of stability between versions, so that
- once a string is specified as belonging to a particular class of identifier, it must <i>stay</i>
- in that class for future versions of the standard. Because of that requirement
- for identifier stability, there are associated constraints on
- how the related character properties can change. In particular, the identifier-related properties
- listed in <i>Table 19a</i> may have their values for any particular assigned character
- change from No to Yes between versions of the standard, but once a character has the
- value Yes, that value is locked in, and cannot ever be changed back to No.</p>
-
- <p class="caption">Table 19a. <a name="Yes_Locked_Properties_Table" href="#Yes_Locked_Properties_Table">Yes-Locked Properties</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Property Name</th>
- <th>Abbr Name</th>
- <th>Default Value</th>
- </tr>
- <tr>
- <td>ID_Start</td>
- <td>IDS</td>
- <td>No</td>
- </tr>
- <tr>
- <td>ID_Continue</td>
- <td>IDC</td>
- <td>No</td>
- </tr>
- <tr>
- <td>XID_Start</td>
- <td>XIDS</td>
- <td>No</td>
- </tr>
- <tr>
- <td>XID_Continue</td>
- <td>XIDC</td>
- <td>No</td>
- </tr>
- </table>
- </div>
- <p>In some cases, a property is not immutable, but the list
- of possible values that it can have is considered
- invariant. For example, while at least some General_Category
- values are subject to change and correction, the enumerated set
- of possible values that the General_Category property can have
- is fixed and cannot be added to in the future. However, not all Enumeration
- properties used by Unicode algorithms have immutable lists of
- property values. For example, the enumerated lists of values
- associated with the Line_Break and the Word_Break properties have
- changed in the past, and may be changed again in future versions
- of the standard.</p>
- <p>All characters other than
- those of General_Category M* are guaranteed to have Canonical_Combining_Class=0.
- Currently it is also true that all characters
- other than those of General_Category Mn have Canonical_Combining_Class=0.
- However, the more constrained statement is not a guaranteed invariant;
- it is possible that some new character of
- General_Category Me or Mc could be given a non-zero value for
- Canonical_Combining_Class in the future.</p>
- <p>In Unicode 4.0 and thereafter, the General_Category value
- <i>Decimal_Number</i> (Nd), and
- the Numeric_Type value <i>Decimal</i> (de) are defined to be co-extensive;
- that is, the set of
- characters having General_Category=Nd will always be the same as the
- set of characters having Numeric_Type=de.</p>
- <h4>5.10.2 <a name="File_Invariants" href="#File_Invariants">UCD File Format Invariants</a></h4>
-
- <p>There are also some constraints on allowable change in the
- file formats for UCD files. In general, the
- <a href="#Format_Conventions">file format conventions</a> are
- changed as little as possible, to minimize the impact on
- implementations which parse the machine-readable data files.
- However, some of the constraints on allowable file format
- change go beyond conservatism in format and instead have
- the status of invariants. These guarantees apply in particular
- to UnicodeData.txt, the very first data file associated with
- the UCD.</p>
-
- <p>The number and order of the fields in UnicodeData.txt are fixed.
- Any additional information about character properties to be added
- to the UCD in the future will
- appear in separate data files, rather than being added as an
- additional field to UnicodeData.txt or by reinterpretation
- of any of the existing fields.</p>
-
- <h4>5.10.3 <a name="Invariants_in_Implementations" href="#Invariants_in_Implementations">Invariants in Implementations</a></h4>
-
- <p>Applications may wish to take the various character property
- and file format
- invariants into account when choosing how to implement character properties.</p>
-
- <p>The Canonical_Combining_Class offers a good example. The
- character property invariants regarding Canonical_Combining_Class
- guarantee that values, once assigned, will never change, and
- that all values used will be in the range 0..254. This means
- that the Canonical_Combining_Class can be safely implemented
- in an unsigned byte and that any value stored in a table for
- an existing character will not need to be updated dynamically
- for a later version.</p>
-
- <p>In practice, for Canonical_Combining_Class far fewer
- than 256 values are used. Unicode 3.0 used 53 values;
- Unicode 3.1 through Unicode 4.1 used 54 values; and Unicode 5.0
- through Unicode 9.0 used 55 values. New, non-zero
- Canonical_Combining_Class values are seldom added to the standard.
- (For details about this history, see
- <a href="#DerivedCombiningClass.txt">DerivedCombiningClass.txt</a>.)
- Implementations may take advantage of this fact for compression,
- because only the ordering of
- the non-zero values, and not their absolute values, matters for
- the Canonical Ordering Algorithm. In principle, it would be
- possible for up to 255 values to be used in the future, but
- the chances of the actual number of values exceeding 128
- are remote at this point. There are implementation advantages
- in restricting the number of internal class values to
- 128—for example, the ability to use signed bytes without
- implicit widening to ints in Java.</p>
-
- <h3>5.11 <a name="Validation" href="#Validation">Validation</a></h3>
-
- <p>The Unicode character
- property values in the UCD files can be validated by means of regular
- expressions. Such validation can also be useful in testing of
- implementations that return property values. The method of validation
- depends on the type of property, as described below.
- These expressions use Perl syntax, but may
- of course be converted to other formal conventions for use
- with other regular expression engines.</p>
-
- <p>The regular expressions which are appropriate for validation
- of particular properties may change in each subsequent version of the UCD.
- However, because of stability guarantees for character property aliases, these
- regular expressions for one version of
- the Unicode Standard will match valid values for previous versions
- of the standard.</p>
-
- <h4>5.11.1 <a name="Validation_of_Enumerated" href="#Validation_of_Enumerated">Enumerated and Binary Properties</a></h4>
-
- <p>Enumerated and binary character properties can be validated by
- generating a regular expression using the PropertyValueAliases.txt file. Because
- enumerated properties have a defined list of possible values, the validating
- regular expression simply ORs together all of the possible values. Binary properties
- are a special case of enumerated property, with a predefined very short
- list of possible values.</p>
-
- <p>For example, to validate the East_Asian_Width property in
- the UCD, or to test an implementation that returns the East_Asian_Width property,
- parse the following relevant lines from PropertyValueAliases.txt and produce a
- regular expression that concatenates each of the short and long property alias
- values.</p>
- <blockquote>
- <pre>
- # East_Asian_Width (ea)
- ea ; A ; Ambiguous
- ea ; F ; Fullwidth
- ea ; H ; Halfwidth
- ea ; N ; Neutral
- ea ; Na ; Narrow
- ea ; W ; Wide
- </pre>
- </blockquote>
-
- <p>The resulting regular expression would then be:</p>
-
- <blockquote>
- <pre>
- /A|Ambiguous|F|Fullwidth|H|Halfwidth|N|Neutral|Na|Narrow|W|Wide/
- </pre>
- </blockquote>
-
- <p>For each Unicode binary character property, the regular
- expression can be precomputed simply as:</p>
-
- <blockquote>
- <pre>
- /N|No|F|False|Y|Yes|T|True/
- </pre>
- </blockquote>
-
- <p>The Catalog properties, Age, Block, and Script, are another
- type of enumerated character property. All possible values of those properties
- for any given version of the Unicode Standard are listed in PropertyValueAliases.txt,
- so a validating regular expression for a Catalog property for that given version of the UCD can be
- generated by concatenating values, as for the other enumerated properties.</p>
-
- <h4>5.11.2 <a name="Validation_of_CCC" href="#Validation_of_CCC">Canonical_Combining_Class Property</a></h4>
-
- <p>The Canonical_Combining_Class (ccc) property is a hybrid type. The
- possible values defined for it in UnicodeData.txt range from 0 to 254 and are numeric
- values. However, Canonical_Combining_Class also has symbolic aliases defined for those particular values
- that are in actual use; those symbolic aliases are listed in PropertyValueAliases.txt.
- To produce a validating regular expression for Canonical_Combining_Class, concatenate
- together the symbolic aliases from PropertyValueAliases.txt, and then add the numeric
- range 0..254.</p>
-
- <p>The value 255 is reserved for use by implementations. When the
- ccc values are represented by bytes, that additional value of 255 may be used
- by an implementation for other purposes.</p>
-
- <p>The value 133 is reserved. No characters have that value. The property value alias
- CCC133 is retained in accordance with the stability policy regarding property value aliases.</p>
-
- <h4>5.11.3 <a name="Validation_of_Unihan" href="#Validation_of_Unihan">Unihan Properties</a></h4>
-
- <p>The validating regular expressions for each property tag defined
- in the Unihan database are described in detail in [<a href="../tr41/tr41-21.html#UAX38">UAX38</a>].</p>
- <h4>5.11.4 <a name="Validation_of_Other" href="#Validation_of_Other">Other Properties</a></h4>
-
- <p>Regular expressions to validate String and Miscellaneous properties
- in the UCD are provided in <i>Table 21</i>. Although Catalog properties may use
- strict tests, as described in <i>Section 5.11.1 <a href="#Validation_of_Enumerated">Enumerated and Binary Properties</a></i>,
- generic patterns for Block
- and Script are also provided in <i>Table 21</i>.</p>
-
- <p>To simplify the
- presentation of these expressions, commonly occurring subexpressions are first
- abstracted out as variables defined in <i>Table 20</i>.</p>
- <p class="caption">Table 20. <a name="Common_Subexpressions_Table" href="#Common_Subexpressions_Table">Common Subexpressions for Validation</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Variable</th>
- <th>Value</th>
- <th>Notes and Examples</th>
- </tr>
- <tr>
- <td>$digit</td>
- <td>[0-9]</td>
- <td>"0", "3"</td>
- </tr>
- <tr>
- <td>$hexDigit</td>
- <td>[0-9A-F]</td>
- <td>"1", "A"</td>
- </tr>
- <tr>
- <td>$alphaNum</td>
- <td>[0-9A-Za-z]</td>
- <td>"1", "A", "z"</td>
- </tr>
- <tr>
- <td>$digits</td>
- <td>$digit+</td>
- <td>"0", "12345"</td>
- </tr>
- <tr>
- <td>$label</td>
- <td>$alphaNum+</td>
- <td>"A", "Syriac", "NGKWAEN", "123467", "A005A"</td>
- </tr>
- <tr>
- <td>$positiveDecimal</td>
- <td>$digits\.$digits</td>
- <td>"3.1"</td>
- </tr>
- <tr>
- <td>$decimal</td>
- <td>-?$positiveDecimal</td>
- <td>"3.5", "-0.5"</td>
- </tr>
- <tr>
- <td>$rational</td>
- <td>-?$digits(/$digits)?</td>
- <td>"3/4", "-3/4"</td>
- </tr>
- <tr>
- <td>$optionalDecimal</td>
- <td>-?$digits(\.$digits)?</td>
- <td>"3.5", "-0.5", "2", "1000"</td>
- </tr>
- <tr>
- <td>$name</td>
- <td>$label(( -|- |[-_ ])$label)*</td>
- <td>name, with potential non-medial hyphens</td>
- </tr>
- <tr>
- <td>$name2</td>
- <td>$label([-_ ]$label)*</td>
- <td>name, no non-medial hyphens allowed</td>
- </tr>
- <tr>
- <td>$annotatedName</td>
- <td>$name2( \(.*\))?</td>
- <td>name with optional parenthetical annotation</td>
- </tr>
- <tr>
- <td>$shortName</td>
- <td>[A-Z]{0,3}</td>
- <td>"", "O", "WA", "WAE"</td>
- </tr>
- <tr>
- <td>$codePoint</td>
- <td>(10|$hexDigit)?$hexDigit{4}</td>
- <td>"00A0", "E0100", "10FFFF"</td>
- </tr>
- <tr>
- <td>$codePoints</td>
- <td>$codePoint(\s$codePoint)*</td>
- <td>space-delimited list of 1 to n code points</td>
- </tr>
- <tr>
- <td>$codePoint0</td>
- <td>($codePoints)?</td>
- <td>space-delimited list of 0 to n code points</td>
- </tr>
- </table>
- </div>
-
- <p>The regular expressions listed in <i>Table 21</i> cover
- all the straightforward cases for other property values. For properties
- involving somewhat more irregular values, such as <a href="#Age">Age</a>,
- <a href="#ISO_Comment">ISO_Comment</a>, and <a href="#Unicode_1_Name">Unicode_1_Name</a>,
- details for validation can be found in [<a href="../tr41/tr41-21.html#UAX42">UAX42</a>].</p>
-
- <p class="caption">Table 21. <a name="Regular_Expressions_Table" href="#Regular_Expressions_Table">Regular Expressions for Other Property Values</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>Abbr</th>
- <th>Name</th>
- <th colspan="2">Regex for Allowable Values</th>
- </tr>
- <tr>
- <td rowspan="3">nv</td>
- <td rowspan="3">Numeric_Value</td>
- <td>/$decimal/</td>
- <td>Field 2</td>
- </tr>
- <tr>
- <td>/$optionalDecimal/</td>
- <td>Field 3</td>
- </tr>
- <tr>
- <td colspan="2">/$rational/</td>
- </tr>
- <tr>
- <td>blk</td>
- <td>Block</td>
- <td rowSpan="2" colspan="2">/$name2/</td>
- </tr>
- <tr>
- <td>sc</td>
- <td>Script</td>
- </tr>
- <tr>
- <td>dm</td>
- <td>Decomposition_Mapping</td>
- <td rowSpan="2" colspan="2">
- /$codePoints/</td>
- </tr>
- <tr>
- <td>FC_NFKC</td>
- <td>FC_NFKC_Closure</td>
- </tr>
- <tr>
- <td>NFKC_CF</td>
- <td>NFKC_Casefold</td>
- <td colspan="2">/$codePoint0/</td>
- </tr>
- <tr>
- <td>cf</td>
- <td>Case_Folding</td>
- <td rowSpan="4" colspan="2">
- /$codePoints/</td>
- </tr>
- <tr>
- <td>lc</td>
- <td>Lowercase_Mapping</td>
- </tr>
- <tr>
- <td>tc</td>
- <td>Titlecase_Mapping</td>
- </tr>
- <tr>
- <td>uc</td>
- <td>Uppercase_Mapping</td>
- </tr>
- <tr>
- <td>scf</td>
- <td>Simple_Case_Folding</td>
- <td rowSpan="4" colspan="2">
- /$codePoint/</td>
- </tr>
- <tr>
- <td>slc</td>
- <td>Simple_Lowercase_Mapping</td>
- </tr>
- <tr>
- <td>stc</td>
- <td>Simple_Titlecase_Mapping</td>
- </tr>
- <tr>
- <td>suc</td>
- <td>Simple_Uppercase_Mapping</td>
- </tr>
- <tr>
- <td>bmg</td>
- <td>Bidi_Mirroring_Glyph</td>
- <td colspan="2">/$codePoint/</td>
- </tr>
- <tr>
- <td>na</td>
- <td>Name</td>
- <td rowspan="3" colspan="2">/$name/</td>
- </tr>
- <tr>
- <td>Name_Alias</td>
- <td>Name_Alias</td>
- </tr>
- <tr>
- <td>--</td>
- <td>Names for named sequences*</td>
- </tr>
- <tr>
- <td>na1</td>
- <td>Unicode_1_Name</td>
- <td colspan="2">/$annotatedName/</td>
- </tr>
- <tr>
- <td>jsn</td>
- <td>Jamo_Short_Name</td>
- <td colspan="2">/$shortName/</td>
- </tr>
- </table>
- </div>
- <blockquote>
- <p>* The names for Unicode named character sequences are not formally Unicode
- character property values. However, they follow the same syntax as the Name and Name_Alias
- property values.</p>
- </blockquote>
- <h4>5.11.5 <a name="Validation_of_Multivalued" href="#Validation_of_Multivalued">Validation of Multivalued Properties</a></h4>
- <p>Some properties, such as Script_Extensions or kCantonese, have property
- values each consisting of a set of element values. In the data files, these element values
- are separated by spaces. Validation of the property values is performed by first splitting
- each set into element values at the spaces, and then validating each element value
- individually. For example, the elements for Script_Extensions are values of the
- Script property; they are validated according to the validation requirements for the
- Script property. See also Section 5.7.6 <a href="#Property_Values_As_Sets">Properties Whose Values Are Sets of Values</a>.</p>
- <p>The Name_Alias property has values which consist of sets of one or
- more name strings. In the data file for this property, each element value occurs on
- a separate line and can be validated as a separate element.</p>
-
- <h3>5.12 <a name="Deprecation" href="#Deprecation">Deprecation</a></h3>
- <p>In the Unicode Standard, the term <i>deprecation</i> is used somewhat
- differently than it is in some other standards. Deprecation is used to
- mean that a character or other feature is strongly discouraged from use.
- This should not, however, be taken as indicating that anything has been
- removed from the standard, nor that anything is <i>planned</i> for removal
- from the standard. Any such change is constrained by the
- Unicode Consortium Stability Policies [<a href="../tr41/tr41-21.html#Stability">Stability</a>].</p>
- <p>For the Unicode Character Database, there are two important types
- of deprecation to be noted. First, an <i>encoded character</i> may be
- deprecated. Second, a <i>character property</i> may be deprecated.</p>
- <p>When an encoded character is strongly discouraged from use, it is
- given the property value Deprecated=True. The <a href="#Deprecated">Deprecated</a> property
- is a binary property defined specifically to carry this information about
- Unicode characters. Very few characters are ever formally
- deprecated this way; it is not enough that a character be uncommon, obsolete,
- disliked, or not preferred. Only those few characters which have been
- determined by the UTC to have serious architectural defects or which
- have been determined to cause significant implementation problems are
- ever deprecated. Even in the most severe cases, such as the
- deprecated format control characters (U+206A..U+206F), an encoded character
- is <i>never</i> removed from the standard. Furthermore, although deprecated
- characters are strongly discouraged from use, and should be avoided in
- favor of other, more appropriate mechanisms, they <i>may</i> occur in data.
- Conformant implementations of Unicode processes such as Unicode normalization <i>must</i>
- handle even deprecated characters correctly.</p>
- <p>In the Unicode Character Database, a character property may
- also become strongly discouraged—usually because it no longer
- serves the purpose it was originally defined for. In such cases, the
- property is labelled "deprecated" in
- <i>Table 9, <a href="#Property_List_Table">Property Table</a></i>.
- For example, see the <a href="#Grapheme_Link">Grapheme_Link</a> property.
- Deprecated properties are not recommended for
- exposure in public APIs that support Unicode character properties.</p>
- <h3>5.13 <a name="Property_APIs" href="#Property_APIs">Property APIs</a></h3>
- <p>The Unicode Standard does not specify the exact form of APIs which may be defined
- in software libraries to surface Unicode character properties to applications. However, there
- are some recommendations and general guidelines to follow, which should serve to reduce
- potential confusion and to promote better interoperability between applications using
- the Unicode Character Database.</p>
- <p>In the discussion which follows here, the term <i>API</i> is
- used to refer to a particular function or method, whereas the term <i>API collection</i>
- is used to refer to a related group of APIs, which might constitute a set of functions
- exported from a library, a class definition, or other groupings of related functionality.
- A distinction is also made between a <i>public API</i>, which is exported for general
- application use, and a <i>private API</i>, which may be kept hidden within a library or
- class, intended for internal use.</p>
- <p>First, if an API surfaces values of a particular Unicode character property
- and <i>purports</i> that value to represent a Unicode character property, it should exactly
- follow the specification of that property in the UCD. This principle follows from the
- general approach to conformance for the Unicode Standard: If you say it is Unicode,
- then it should follow the Unicode Standard specification.</p>
- <p>Second, an API should be clear about which version of the UCD it
- supports. This can be done, for example, with documentation, either external or
- included in the source in header files, class definition notes, and so forth.
- For an API collection, an even better option is to include an API which explicitly
- reports which version of the UCD is supported.
- This provision should reduce confusion regarding particular property
- values which might change between versions of the Unicode Standard, as well as making
- it clear which repertoire of encoded characters is intended to be covered. There is
- no principled constraint on an API supporting <i>more than one</i> version of the UCD, as long
- as it is clear about how it does so.</p>
- <p>Third, although there is no constraint on an API declaring that it
- only supports a designated subset of Unicode characters, best practice for a general
- purpose character property API would be to support the entire range of Unicode
- code points, providing determinate and well-documented property values for any valid Unicode
- code point input. That would include providing correct default property values for
- any unassigned code point. See <i>Section 2.2, <a href="#Use_Default">Use of Default Values</a></i>
- for an explanation of that concept.</p>
- <p>Fourth, a Unicode character property API is not precluded from
- extending or tailoring its support of character properties, as long as such
- behavior is clearly documented, so that applications understand the values they
- will be getting by calling the API. For example, an API might surface an
- extended new property such as IsDanda, which is not formally part of the
- properties specified by the UCD, but which can be inferred from the
- documentation of the Unicode Standard. An API supporting a particular
- tailoring of the Unicode Line Breaking Algorithm could surface tailored
- Line_Break property values to support that behavior. Alternatively, an API supporting
- a particular private use agreement could surface privately-defined properties
- for a designated range of PUA characters. All such use of APIs should be
- considered conformant ways of extending API collections using the UCD.</p>
- <p>Designers of API collections to support Unicode character properties must
- also be aware that not all Unicode character properties are equal. There is no
- requirement, express or implied, that <i>all</i> Unicode character properties
- should be supported in a given API collection. In fact, an approach that simply parses
- the UCD and surfaces <i>all</i> Unicode character properties verbatim is
- very likely to result in a bad design. Character properties need to be
- understood in the context of the various Unicode algorithms they are designed
- to support.</p>
- <p>The following subtypes of
- Unicode character properties should generally <i>not</i> be exposed in APIs,
- except in limited circumstances. They may not be useful, particularly
- in public API collections, and may instead prove misleading to the users
- of such API collections.</p>
- <ul>
- <li><i><a href="#Contributory_Properties">Contributory properties</a></i> are not recommended for public APIs.</li>
- <li>A subset of Unicode normalization-related properties are not recommended for public APIs. See
- <i>Section 5.7.5, <a href="#Decompositions_and_Normalization">Decompositions and Normalization</a></i>.</li>
- <li>Deprecated properties are not recommended for public APIs. See
- <i>Section 5.12, <a href="#Deprecation">Deprecation</a></i>.</li>
- </ul>
- <h3>5.14 <a name="Character_Age" href="#Character_Age">Character Age</a></h3>
- <p>The <a href="#Age">Age</a> property indicates the first version in which a
- particular Unicode character was assigned. For example, U+20AC € EURO SIGN was
- added to Version 2.1 of the Unicode Standard, so it has age=2.1, while
- U+20B9 ₹ INDIAN RUPEE SIGN was added to Version 6.0 of the Unicode Standard,
- so it has age=6.0.</p>
- <p>The short values for the Age property for assigned (designated) code points are of the form "m.n",
- with the first field corresponding to the major version, and the second field corresponding
- to the minor version. There is no need for a third version field, because new
- characters are never assigned in update versions of
- the standard. The long
- values for the Age property for assigned code points start with a "V" and use an underscore instead
- of a dot between the major and minor version numbers: V2_1, V6_0, and so on. This
- makes the long format more useful as an identifier in programming languages. It is
- also useful in regular expressions, where the dot has other significance.</p>
- <p>The default value of the Age property, used for unassigned (undesignated) code points,
- is expressed with labels that depart from the numerical versioning scheme
- of the Age property for assigned code points; the short form for the default is "NA",
- and the long form for the default is "Unassigned". Implementations of parsers
- which manipulate the Age property need to be prepared for this special case,
- rather than expecting the default value to be expressed numerically, as "0.0", for example.</p>
- <p>The Age property is
- based on when a character is encoded in the standard. It is normative and immutable, and
- cannot be meaningfully tailored.</p>
- <p>The minimum value of the Age property is "1.1",
- instead of "1.0", because of the substantial and
- incompatible changes to the standard resulting from the merger of code points and
- character names between the Unicode Standard and ISO/IEC 10646 for their 1993
- publications. For Hangul syllable characters, which were
- extensively augmented in Unicode 2.0, the Age value is set to "2.0", even
- though a subset of the Hangul syllables had been published in earlier versions,
- at different code points.</p>
- <p>Private use characters, noncharacter code points, and surrogate code
- points also get Age values. The private use characters and noncharacter code
- points on the BMP have age=1.1. However, the full architecture for UTF-16 and multiple planes
- was not fully documented until Unicode 2.0, so the private use characters and
- noncharacter code points on supplementary planes, as well as the surrogate
- code points in the range D800..DFFF, are given the value age=2.0.</p>
- <p>The Age property cannot be derived from the other
- data files in any single version of the Unicode Character Database. Its derivation
- is done, rather, by tools that compare the assigned characters <i>between</i>
- subsequent versions. The data file <a href="#DerivedAge.txt">DerivedAge.txt</a>
- provides the definitive listing of the
- Age property value for all code points, as of that version of the standard.</p>
- <p>The typical use case for the Age property in regular expressions
- is to search for all characters that were
- present in a given version. For this reason,
- an expression such as "\p{age=V3_0}" is exceptionally
- defined to match all of the code
- points assigned in Version 3.0—that is, all the code points with
- a value <i>less than or equal to</i> the value 3.0 for the Age property, rather than
- just the subset of those code points with the value 3.0. This interprets
- "\p{age=V3_0}"
- as the set of all characters assigned as of Unicode 3.0, rather than
- as just the set of characters <i>added</i> to Unicode 3.0 subsequent to the
- prior version. For more
- information, see Unicode Technical Standard #18,
- "Unicode Regular Expressions" [<a href="../tr41/tr41-21.html#UTS18">UTS18</a>].</p>
- <h2>6 <a name="Test_Files" href="#Test_Files">Test Files</a></h2>
-
- <p>The UCD contains a number of test data files.
- Those provide data in standard formats which can be used to test
- implementations of Unicode algorithms. The test data files
- distributed with this version of the UCD are listed in
- <i>Table 22</i>.</p>
-
- <p class="caption">Table 22. <a name="Algorithm_Test_Table" href="#Algorithm_Test_Table">Unicode Algorithm Test Data Files</a></p>
- <div align="center">
- <table class="simple">
- <tr>
- <th>File Name</th>
- <th>Specification</th>
- <th>Status</th>
- <th>Unicode Algorithm</th>
- </tr>
- <tr>
- <td>BidiTest.txt</td>
- <td>[<a href="../tr41/tr41-21.html#UAX9">UAX9</a>]</td>
- <td style="text-align:center">N</td>
- <td>Unicode Bidirectional Algorithm</td>
- </tr>
- <tr>
- <td>BidiCharacterTest.txt</td>
- <td>[<a href="../tr41/tr41-21.html#UAX9">UAX9</a>]</td>
- <td style="text-align:center">N</td>
- <td>Unicode Bidirectional Algorithm</td>
- </tr>
- <tr>
- <td>NormalizationTest.txt</td>
- <td>[<a href="../tr41/tr41-21.html#UAX15">UAX15</a>]</td>
- <td style="text-align:center">N</td>
- <td>Unicode Normalization Algorithm</td>
- </tr>
- <tr>
- <td>LineBreakTest.txt</td>
- <td>[<a href="../tr41/tr41-21.html#UAX14">UAX14</a>]</td>
- <td style="text-align:center">N</td>
- <td>Unicode Line Breaking Algorithm</td>
- </tr>
- <tr>
- <td>GraphemeBreakTest.txt</td>
- <td>[<a href="../tr41/tr41-21.html#UAX29">UAX29</a>]</td>
- <td style="text-align:center">N</td>
- <td>Grapheme Cluster Boundary Determination</td>
- </tr>
- <tr>
- <td>WordBreakTest.txt</td>
- <td>[<a href="../tr41/tr41-21.html#UAX29">UAX29</a>]</td>
- <td style="text-align:center">N</td>
- <td>Word Boundary Determination</td>
- </tr>
- <tr>
- <td>SentenceBreakTest.txt</td>
- <td>[<a href="../tr41/tr41-21.html#UAX29">UAX29</a>]</td>
- <td style="text-align:center">N</td>
- <td>Sentence Boundary Determination</td>
- </tr>
- </table>
- </div>
-
- <p>The normative status of these test files reflects their use to
- determine the correctness of implementations claiming conformance
- to the respective algorithms listed in the table. There is no
- requirement that any particular Unicode implementation also
- implement the Unicode Line Breaking Algorithm, for example, but
- <i>if</i> it implements that algorithm correctly, it should be
- able to replicate the test case results specified in the
- data entries in LineBreakTest.txt.</p>
- <h3>6.1 <a name="NormalizationTest_txt" href="#NormalizationTest_txt"> NormalizationTest.txt </a></h3>
- <p>This file contains data which can be used to test an implementation of the
- Unicode Normalization Algorithm.
- (See [<a href="../tr41/tr41-21.html#UAX15">UAX15</a>] and [<a href="../tr41/tr41-21.html#Tests15">Tests15</a>].)</p>
-
- <p>The data file has a Unicode string in the first field (which may consist
- of just a single code point). The next four fields then specify the expected
- output results of converting that string to Unicode Normalization Forms
- NFC, NFD, NFKC, and NFKD, respectively. There are many tricky edge cases
- included in the input data, to ensure that implementations have correctly
- implemented some of the more complex subtleties of the Unicode Normalization
- Algorithm.</p>
-
- <p>The header section of NormalizationTest.txt provides additional information
- regarding the normalization invariant relations that any conformant
- implementation should be able to replicate.</p>
-
- <p>The Unicode Normalization Algorithm is not tailorable. Conformant
- implementations should be expected to produce results as specified in
- NormalizationTest.txt and should not deviate from those results.</p>
- <h3>6.2 <a name="Segmentation_Test_Files" href="#Segmentation_Test_Files">Segmentation Test Files and Documentation</a></h3>
- <p>LineBreakTest.txt, located in the auxiliary directory of the UCD,
- contains data which can be used
- to test an implementation of the Unicode Line Breaking Algorithm.
- (See [<a href="../tr41/tr41-21.html#UAX14">UAX14</a>] and [<a href="../tr41/tr41-21.html#Tests14">Tests14</a>].) The header of
- that file specifies the data format and the use of the test data to
- specify line break opportunities. Note that non-ASCII characters are used
- in this test data as field delimiters.</p>
-
- <p>There is an associated documentation file, LineBreakTest.html, which displays
- the results of the Line Breaking Algorithm in an interactive chart form, with a
- documented listing of the rules.</p>
- <p>The Unicode text segmentation test data files are also located in the
- auxiliary directory of the UCD. (See [<a href="../tr41/tr41-21.html#Tests29">Tests29</a>].) They
- contain data which can be used to test an implementation of the segmentation
- algorithms specified in [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>].
- The headers of
- those files specify the data format and the use of the test data to
- specify text segmentation opportunities. Note that non-ASCII characters are used
- in this test data as field delimiters.</p>
-
- <p>There are also associated documentation
- files, which display the results of the segmentation algorithms in an
- interactive chart form, with a documented listing of the rules:</p>
- <ul>
- <li>GraphemeBreakTest.html </li>
- <li>SentenceBreakTest.html </li>
- <li>WordBreakTest.html </li>
- </ul>
-
- <p>Unlike the Unicode Normalization Algorithm, the Unicode Line Breaking
- Algorithm and the various text segmentation algorithms are tailorable,
- and there is every expectation that implementations will tailor these
- algorithms to produce results as needed. The test data files only test
- the <i>default</i> behavior of the algorithms. Testing of tailored implementations
- will need to modify and/or extend the test cases as appropriate to match
- any documented tailoring.</p>
-
- <h3>6.3 <a name="BidiTest_txt" href="#BidiTest_txt">Bidirectional Test Files</a></h3>
- <p>These files contain data
- which can be used to test an implementation of the
- Unicode Bidirectional Algorithm.
- (See [<a href="../tr41/tr41-21.html#UAX9">UAX9</a>] and [<a href="../tr41/tr41-21.html#Tests9">Tests9</a>].)</p>
-
- <p>The data in BidiTest.txt is intended to exhaustively test
- all possible combinations of Bidi_Class values for strings of length four or less.
- To allow for the resulting very large number of test cases,
- the data file has a somewhat complicated format which is
- described in the header. Fundamentally, for each input string and for each
- possible input paragraph level, the test data specifies the resulting bidi levels and
- expected reordering.</p>
-
- <p>The data in BidiCharacterTest.txt is provided to test various
- edge cases for the algorithm. It contains an extra field which allows for explicit
- control of the overall directional context for each test case.</p>
- <p>The Unicode Bidirectional Algorithm is tailorable within certain limits.
- Conformant implementations with no tailoring are expected to produce the results as
- specified in BidiTest.txt and BidiCharacterTest.txt, and should not deviate from those results. Tailored
- implementations can also use the data in
- the test files to test for overall conformance
- to the algorithm by changing the assignment of properties to characters to reflect
- the details of their tailoring.</p>
- <h2>7 <a name="Change_History" href="#Change_History">UCD Change History</a></h2>
- <p>This section summarizes the recent
- changes to the UCD—including its documentation files—and
- is organized by Unicode versions.</p>
- <p>References in the change history
- are often made to a Public Review Issue (PRI). See
- <a href="http://www.unicode.org/review/resolved.html">
- http://www.unicode.org/review/resolved.html</a> for more information about
- each of those cases.</p>
- <hr>
- <h3><a name="Unicode_10.0.0" href="http://www.unicode.org/versions/components-10.0.0.html">Unicode 10.0.0</a></h3>
- <p><b>Changes in specific files:</b></p>
- <p>New data files were added to the UCD: two primary files, NushuSources.txt and VerticalOrientation.txt,
- documented in this section, as well as an extracted file, DerivedName.txt, generated from UnicodeData.txt.</p>
- <p>The documentation file StandardizedVariants.html, already obsoleted as of Version 9.0,
- was removed altogether from the UCD. Its function was superseded by the Unicode code charts and the emoji charts.</p>
- <p>Appropriate existing data files were updated to add the 8,518 new characters encoded in Unicode 10.0,
- which consist of 7,494 CJK unified ideographs and 1,024 other characters.
- Major changes that are most likely to affect implementations are documented
- in <a href="http://www.unicode.org/versions/Unicode10.0.0/#Migration">Section M of the Unicode 10.0.0 page</a>.
- Detailed data file updates resulting from encoding the new characters and from various character
- property changes are summarized below, in the same grouping manner used in
- <a href="http://www.unicode.org/versions/components-10.0.0.html">Components of Unicode 10.0.0</a>.</p>
- <p>Note that minor editorial updates and changes to the derived and extracted data files are not documented here.</p>
- <h4>Core Data</h4>
- <ul>
- <li>ArabicShaping.txt
- <ul>
- <li>Entries were added for the 11 letters in the new Syriac Supplement block.
- The letters, used for writing a dialect of the Malayalam language in the Syriac script (a form of Garshuni),
- each have their own joining group, with schematic names that include the word MALAYALAM.</li>
- </ul>
- </li>
- <li>Blocks.txt
- <ul>
- <li>Seven new blocks were added, including blocks for the four new scripts, Masaram Gondi, Nushu, Soyombo, and Zanabazar Square.</li>
- <li>A Syriac Supplement block was added to the right-to-left area of the Basic Multilingual Plane.</li>
- <li>A large collection of rare and historic CJK unified ideographs was added in a new block, CJK Unified Ideographs Extension F.
- A set of 21 CJK unified ideographs was also added to the main CJK Unified Ideographs block.</li>
- <li>A set of 285 Hentaigana characters was added to the existing Kana Supplement block and a new, adjacent block, Kana Extended-A.</li>
- </ul>
- </li>
- <li>EastAsianWidth.txt
- <ul>
- <li>The following sets of newly encoded characters were assigned the East_Asian_Width property value Wide:
- the new CJK unified ideographs; the Hentaigana characters; all Nushu characters, including the iteration mark, U+16FE1;
- and one new Bopomofo letter, U+312E.</li>
- <li>The 56 newly encoded pictographic symbols that have the Emoji_Presentation property as of Version 5.0 of
- Unicode Technical Standard #51, "Unicode Emoji", were also assigned the East_Asian_Width property value Wide
- [<a href="../tr41/tr41-21.html#UTS51">UTS51</a>].</li>
- <li>All of the other new characters, including new symbols, were assigned the East_Asian_Width property value Neutral.</li>
- </ul>
- </li>
- <li>EmojiSources.txt
- <ul>
- <li>There were no data additions or changes, but a comment was added to document that 11 mappings for keycap sequences
- are historical and differ from the named character sequences with keycaps listed in NamedSequences.txt and the
- corresponding UTS #51 emoji keycap sequences.</li>
- </ul>
- </li>
- <li>IndicPositionalCategory.txt
- <ul>
- <li>Entries were added for the matras and non-vocalic marks of the three Brahmi-derived scripts introduced in
- Unicode 10.0—Masaram Gondi, Soyombo, and Zanabazar Square.</li>
- <li>Entries were also added for new marks of existing Indic scripts, namely Gujarati and Malayalam, as well as a new Vedic mark.</li>
- <li>The Indic_Positional_Category property value of U+A9BF JAVANESE CONSONANT SIGN CAKRA was corrected from Right to Bottom_And_Left.
- The latter is a new Indic_Positional_Category property value, for which a new section was added to the file.</li>
- </ul>
- </li>
- <li>IndicSyllabicCategory.txt
- <ul>
- <li>Characters in the three newly encoded Brahmi-derived scripts, as well as new characters of existing Indic scripts,
- were added with appropriate property values.</li>
- <li>The three newly encoded Gujarati nukta characters, U+0AFD..U+0AFF, were assigned the Indic_Syllabic_Category property
- value Nukta, although their Canonical_Combining_Class property value was set to 0 (Not_Reordered) rather than 7 (Nukta).
- Due to the increased number of exceptions, the derivation expression for the Indic_Syllabic_Category value Nukta
- was removed from the comment lines in the file.</li>
- <li>The classification of several previously encoded Tai Tham characters was revised based on expert feedback.</li>
- <li>A few previously encoded Devanagari and Grantha nasalization signs were assigned the Indic_Syllabic_Category property value Bindu.</li>
- <li>The documentation of a few syllabic categories was also expanded.</li>
- </ul>
- </li>
- <li>LineBreak.txt
- <ul>
- <li>Newly encoded characters were assigned appropriate Line_Break property values.</li>
- <li>The newly encoded U+20BF BITCOIN SIGN was assigned the Line_Break property value Prefix_Numeric,
- the default for currency symbols.</li>
- <li>Of the 56 newly encoded emoji symbols, the 16 which appear as bases in valid emoji modifier sequences or, equivalently,
- have the Emoji_Modifier_Base property as of Version 5.0 of Unicode Technical Standard #51, "Unicode Emoji",
- were assigned the Line_Break property value E_Base [<a href="../tr41/tr41-21.html#UTS51">UTS51</a>].
- That value represents a change from the default value Line_Break=Ideographic for all unassigned code points in
- the range U+1F000..U+1FFFD.
- The other 40 new emoji were assigned the Line_Break property value Ideographic.</li>
- <li>The new Typicon symbols in the range U+1F900..U+1F90B were assigned the Line_Break property value Alphabetic.</li>
- <li>Five previously encoded emoji symbols (U+1F3C2, U+1F3C7, U+1F3CC, U+1F574, and U+1F6CC) changed their
- Line_Break property value from Ideographic to E_Base, because they are included in the set of bases
- for valid emoji modifier sequences as of UTS #51 Version 5.0. (They were added to that set in UTR #51 Version 4.0.)</li>
- <li>Conversely, two previously encoded emoji symbols (U+1F91D and U+1F93C) changed their Line_Break property value
- from E_Base to Ideographic, because they no longer appear in valid emoji modifier sequences as of
- UTS #51 Version 5.0. (They were removed from that set in UTR #51 Version 4.0.)</li>
- <li>No other existing characters changed their Line_Break property values.</li>
- </ul>
- </li>
- <li>NameAliases.txt
- <ul>
- <li>Four formal aliases of type "correction" were added for U+11EC..U+11EF, noting that "YESIEUNG" is the correct identification of the character component termed "IEUNG" in the character names.</li>
- <li>One formal alias of type "correction" was added for U+1B001, to identify it as part of the complete hentaigana set.</li>
- </ul>
- </li>
- <li>NamedSequences.txt
- <ul>
- <li>The set of 12 named character sequences used for emoji keycap sequences was moved from NamedSequencesProv.txt
- to this file, as the named sequences advanced from provisional to approved status.</li>
- </ul>
- </li>
- <li>NamedSequencesProv.txt
- <ul>
- <li>The set of 12 named character sequences used for emoji keycap sequences was moved from this file
- to NamedSequences.txt, as the named sequences advanced from provisional to approved status.</li>
- </ul>
- </li>
- <li>NamesList.txt
- <ul>
- <li>Content was updated throughout with new characters, as well as annotations, cross references, subheadings, and remarks.</li>
- </ul>
- </li>
- <li>NushuSources.txt
- <ul>
- <li>This new data file was added to the UCD. It contains source mappings and readings for Nushu ideographs,
- as well as radical-stroke data for the ideographs, in the same format as the Unihan data files and TangutSources.txt.</li>
- </ul>
- </li>
- <li>PropertyAliases.txt
- <ul>
- <li>An entry was added for the enumerated property Vertical_Orientation, abbreviated vo, which was incorporated in the UCD.</li>
- <li>An entry was added for the newly defined binary property, Regional_Indicator, abbreviated RI.</li>
- </ul>
- </li>
- <li>PropertyValueAliases.txt
- <ul>
- <li>The 10.0 value, with the alias V10_0, was added to the catalog property Age.</li>
- <li>Script and Block property values were listed for the four new scripts and seven new blocks introduced.</li>
- <li>Entries were added for the 11 new Joining_Group property values introduced with the Malayalam Garshuni letters
- in the new Syriac Supplement block.</li>
- <li>New sections were added for the values of the newly defined binary property Regional_Indicator and
- the enumerated property Vertical_Orientation.</li>
- <li>An entry was added for a new Indic_Positional_Category property value, Bottom_And_Left.</li>
- </ul>
- </li>
- <li>PropList.txt
- <ul>
- <li>Most of the newly encoded combining marks were assigned either the contributory property Other_Alphabetic
- or the binary property Diacritic, as appropriate.</li>
- <li>Newly encoded punctuation characters that mark the end of various sections of text, such as dandas,
- were assigned the appropriate binary properties Terminal_Punctuation or Sentence_Terminal.</li>
- <li>All 7,494 new CJK unified ideographs were assigned both the Ideographic and the Unified_Ideograph binary properties.</li>
- <li>The newly encoded Nushu ideographs (which do not include the iteration mark U+16FE1) were assigned
- the Ideographic property, but not the Unified_Ideograph property.</li>
- <li>The newly encoded characters U+16FE1 NUSHU ITERATION MARK and U+11A98 SOYOMBO GEMINATION MARK
- were assigned the binary property Extender.</li>
- <li>A section was added for the 26 regional indicator characters, U+1F1E6..U+1F1FF, which were assigned
- the newly defined binary property Regional_Indicator.</li>
- </ul>
- </li>
- <li>Scripts.txt
- <ul>
- <li>The new characters were assigned appropriate Script property values, including four new values
- for the newly encoded scripts: Masaram_Gondi, Nushu, Soyombo, and Zanabazar_Square.</li>
- <li>The newly encoded emoji were assigned the Script property value Common, in a manner consistent with
- similar characters encoded previously.</li>
- <li>The ideographs in the new block CJK Unified Ideographs Extension F, as well as the 21 added to the main
- CJK Unified Ideographs block, were assigned the Script property value Han.</li>
- <li>The newly encoded Japanese hentaigana characters were assigned the Script property value Hiragana,
- as hentaigana are effectively historic variants of Hiragana syllables.</li>
- <li>The Malayalam Garshuni letters in the new Syriac Supplement block were assigned the Script property value Syriac.</li>
- <li>Other script specific characters were assigned respective Script property values:
- Bengali, Bopomofo, Gujarati, Malayalam, and Old_Italic.</li>
- <li>The Script property value of U+061C ARABIC LETTER MARK (ALM) was changed from Common to Arabic,
- the initial value that the character had taken when it was encoded in Unicode 6.3.
- The change was made so the character can have the same effects on digit substitution as regular Arabic letters.</li>
- <li>The change for ALM was the only change in Script property values for existing characters.</li>
- </ul>
- </li>
- <li>ScriptExtensions.txt
- <ul>
- <li>The newly encoded U+1CF7 VEDIC SIGN ATIKRAMA was assigned the Script_Extensions property value {Bengali},
- as the character is attested in Bengali publications while not being script specific, which is typical for Vedic marks.</li>
- <li>The existing character U+11301 GRANTHA SIGN CANDRABINDU was assigned the Script_Extensions property value
- {Grantha Tamil} based on attested use with Tamil for writing Sanskrit.</li>
- <li>The Script_Extensions property value of U+061C ARABIC LETTER MARK (ALM) was changed to {Arabic Syriac Thaana},
- the initial value that the character had taken when it was encoded in Unicode 6.3.
- The Script_Extensions change was made in conjunction with the Script change to Arabic for U+061C.</li>
- </ul>
- </li>
- <li>StandardizedVariants.txt
- <ul>
- <li>All of the variation sequences involving emoji, now known more specifically as emoji presentation sequences
- and text presentation sequences, were moved from StandardizedVariants.txt to the UTS #51 data file
- emoji-variation-sequences.txt. The latter is a new data file accompanying Version 5.0 of UTS #51,
- "Unicode Emoji", whose emoji character repertoire corresponds to Unicode 10.0
- [<a href="../tr41/tr41-21.html#UTS51">UTS51</a>].</li>
- <li>Corrections were made to the labels of several Mongolian standardized variation sequences,
- but without changes to the actual character sequences.</li>
- </ul>
- </li>
- <li>UnicodeData.txt
- <ul>
- <li>Entries were added for the 8,518 new characters, including letters, combining marks, digits, symbols, and punctuation marks.</li>
- <li>The new characters include a total of 7,494 CJK unified ideographs, of which 21 were allocated at the end of
- the CJK Unified Ideographs block, thus changing the last assigned code point in that block from U+9FD5 to U+9FEA.</li>
- <li>The other 7,473 ideographs, allocated in the new block CJK Unified Ideographs Extension F, are in the range
- U+2CEB0..U+2EBE0, written using the syntax for large ranges of characters with algorithmically derived names.</li>
- <li>The newly encoded Nushu ideographs in the range U+1B170..U+1B2FB also have algorithmic names with the prefix
- "NUSHU CHARACTER-", but were listed individually.</li>
- <li>Among the new nonspacing combining marks, there are 12 which have nonzero Canonical_Combining_Class values.</li>
- <li>The three newly encoded Gujarati nukta characters, U+0AFD..U+0AFF, were assigned the Canonical_Combining_Class property
- value 0 (Not_Reordered) rather than 7 (Nukta), although they were given the Indic_Syllabic_Category property value Nukta.
- The Canonical_Combining_Class assignment is because those characters have specialized use and interact with other
- Gujarati nonspacing marks used for transliteration of Arabic, added in Version 10.0.</li>
- <li>The 11 new Malayalam Garshuni letters have the Bidi_Class property value Arabic_Letter, similar to the existing Syriac letters.</li>
- <li>The new repertoire does not include any cased letters or any characters with nontrivial decomposition mappings.</li>
- <li>There were also no changes in General_Category property values of existing characters in this version.</li>
- </ul>
- </li>
- <li>VerticalOrientation.txt
- <ul>
- <li>This data file, which lists the Vertical_Orientation property values, was formally included in the UCD.</li>
- <li>The newly encoded symbol U+2BD2 GROUP MARK was assigned the Vertical_Orientation property value Rotated.</li>
- <li>All of the code points (assigned characters and unassigned code points) in the following new blocks
- were assigned the Vertical_Orientation property value Upright: Kana Extended-A, Nushu, Soyombo, and Zanabazar Square.</li>
- <li>Other newly encoded characters were assigned Vertical_Orientation property values that did not differ
- from the prior defaults for their code points.</li>
- </ul>
- </li>
- </ul>
- <h4>Unihan Database (Unihan.zip)</h4>
- <ul>
- <li>Unihan_DictionaryIndices.txt
- <ul>
- <li>A few corrections were made in the dictionary index data for a small number of CJK unified ideographs.</li>
- </ul>
- </li>
- <li>Unihan_DictionaryLikeData.txt
- <ul>
- <li><i>Cangjie</i> input codes were added for 40 of the characters in the URO extension area at the end of
- the CJK Unified Ideographs block, and for one character from the CJK Unified Ideographs Extension B block.</li>
- </ul>
- </li>
- <li>Unihan_IRGSources.txt
- <ul>
- <li>kIRG_USource and kRSUnicode field values were added for the 21 new characters encoded in the range U+9FD6..U+9FEA,
- at the end of the CJK Unified Ideographs block.</li>
- <li>IRG source data and kRSUnicode field values were added for the characters in the newly encoded
- CJK Unified Ideographs Extension F block.</li>
- <li>kIRG_GSource field values were added for the 12 unified ideographs in the CJK Compatibility Ideographs block,
- and for a few compatibility ideographs in the same block.</li>
- <li>A correction was made in the kRSUnicode field value of U+7353.</li>
- </ul>
- </li>
- <li>Unihan_RadicalStrokeCounts.txt
- <ul>
- <li>A correction was made in the kRSKangXi field value of U+7353, in coordination with the similar kRSUnicode
- correction in Unihan_IRGSources.txt.</li>
- </ul>
- </li>
- <li>Unihan_Readings.txt
- <ul>
- <li>Many additions, corrections, and other updates were made in kHanyuPinyin and kDefinition field values.</li>
- <li>Other additions and updates were made in a number of kCantonese field values, as well as a few kVietnamese and kMandarin field values.</li>
- </ul>
- </li>
- <li>Unihan_Variants.txt
- <ul>
- <li>The trailing blanks were deleted from a few kSemanticVariant field values, with no other changes in actual data.</li>
- </ul>
- </li>
- </ul>
- <h4>Data for UAX #45</h4>
- <ul>
- <li>USourceData.txt
- <ul>
- <li>Six unencoded CJK ideographs were added as UTC-Source ideographs, with the identifiers UTC-02970 through UTC-02975.</li>
- <li>Minor updates were made in the header, to reflect the encoding of the CJK Unified Ideographs Extension F block.</li>
- </ul>
- </li>
- <li>USourceGlyphs.pdf
- <ul>
- <li>Glyphs were added for the six UTC-Source ideographs introduced in USourceData.txt.</li>
- </ul>
- </li>
- </ul>
- <h4>Conformance Test Data</h4>
- <ul>
- <li>NormalizationTest.txt
- <ul>
- <li>Test cases were added with sequences exercising the 12 newly encoded characters which are
- nonspacing combining marks with nonzero Canonical_Combining_Class property values.</li>
- </ul>
- </li>
- </ul>
- <h4>Auxiliary Data for UAX #14 and UAX #29</h4>
- <ul>
- <li>GraphemeBreakProperty.txt
- <ul>
- <li>Entries were added for the newly encoded characters that were assigned the Grapheme_Cluster_Break property values
- Extend, Prepend, and SpacingMark, according to the derivation expressions of those property values.</li>
- <li>Emoji symbols, both existing and newly encoded, were assigned the Grapheme_Cluster_Break property values
- E_Base and Glue_After_Zwj, based on their classification in Version 5.0 of UTS #51, "Unicode Emoji"
- [<a href="../tr41/tr41-21.html#UTS51">UTS51</a>].</li>
- <li>In particular, the set of Glue_After_Zwj characters includes the old symbols U+2640 FEMALE SIGN,
- U+2642 MALE SIGN, and U+2695 STAFF OF AESCULAPIUS, which have the UTS #51 binary property Emoji=Yes.
- (They were assigned that property in UTR #51 Version 4.0.)</li>
- </ul>
- </li>
- <li>GraphemeBreakTest.txt
- <ul>
- <li>The instances of U+2764 HEAVY BLACK HEART in existing test cases were replaced by U+2640 FEMALE SIGN,
- as sample character for the Grapheme_Cluster_Break class Glue_After_Zwj.</li>
- </ul>
- </li>
- <li>LineBreakTest.txt
- <ul>
- <li>Minor edits were made to the documentation, for clarity and as a result of the removal of the pair table from
- UAX #14, "Unicode Line Breaking Algorithm" [<a href="../tr41/tr41-21.html#UAX14">UAX14</a>].</li>
- </ul>
- </li>
- <li>SentenceBreakProperty.txt
- <ul>
- <li>Entries were added for the newly encoded characters that were assigned the Sentence_Break property values
- Extend, Numeric, OLetter, and STerm, according to the derivation expressions of those property values.</li>
- </ul>
- </li>
- <li>WordBreakProperty.txt
- <ul>
- <li>Entries were added for the newly encoded characters that were assigned the Word_Break property values
- ALetter, Extend, and Numeric, according to the derivation expressions of those property values.</li>
- <li>A set of 34 phonetic modifiers with the General_Category property value Modifier_Symbol was assigned
- the Word_Break property value ALetter. The value was assigned according to the revised definition of ALetter
- in UAX #29, "Unicode Text Segmentation", by direct assignment, without changing the Alphabetic
- or General_Category properties of the affected characters [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>].</li>
- <li>The Word_Break property value of U+02D7 MODIFIER LETTER MINUS SIGN was changed from MidLetter to ALetter,
- as part of the same reclassification of phonetic modifiers.</li>
- <li>Emoji symbols, both existing and newly encoded, were assigned the Word_Break property values
- E_Base and Glue_After_Zwj, based on their classification in Version 5.0 of UTS #51, "Unicode Emoji"
- [<a href="../tr41/tr41-21.html#UTS51">UTS51</a>].</li>
- <li>In particular, the set of Glue_After_Zwj characters includes the old symbols U+2640 FEMALE SIGN,
- U+2642 MALE SIGN, and U+2695 STAFF OF AESCULAPIUS, which have the UTS #51 binary property Emoji=Yes.
- (They were assigned that property in UTR #51 Version 4.0.)</li>
- </ul>
- </li>
- <li>WordBreakTest.txt
- <ul>
- <li>The instances of U+2764 HEAVY BLACK HEART in existing test cases were replaced by U+2640 FEMALE SIGN,
- as sample character for the Word_Break class Glue_After_Zwj.</li>
- </ul>
- </li>
- </ul>
- <h4>Documentation for Auxiliary Data</h4>
- <ul>
- <li>GraphemeBreakTest.html
- <ul>
- <li>The instances of U+2764 HEAVY BLACK HEART in test cases and chart tooltips were replaced by
- U+2640 FEMALE SIGN, as sample character for the Grapheme_Cluster_Break class Glue_After_Zwj.</li>
- <li>Editorial updates were made to the documentation contained in the file.</li>
- </ul>
- </li>
- <li>LineBreakTest.html
- <ul>
- <li>Minor edits were made to the documentation, for clarity and as a result of the removal of the pair table from
- UAX #14, "Unicode Line Breaking Algorithm" [<a href="../tr41/tr41-21.html#UAX14">UAX14</a>].</li>
- </ul>
- </li>
- <li>SentenceBreakTest.html
- <ul>
- <li>Editorial updates were made to the documentation contained in the file.</li>
- </ul>
- </li>
- <li>WordBreakTest.html
- <ul>
- <li>The instances of U+2764 HEAVY BLACK HEART in test cases and chart tooltips were replaced by
- U+2640 FEMALE SIGN, as sample character for the Word_Break class Glue_After_Zwj.</li>
- <li>Editorial updates were made to the documentation contained in the file.</li>
- </ul>
- </li>
- </ul>
- <hr>
- <h3><a name="Unicode_9.0.0" href="http://www.unicode.org/versions/components-9.0.0.html">Unicode 9.0.0</a></h3>
- <p><b>Changes in specific files:</b></p>
- <p>Appropriate data files were updated to add the 7,500 new characters encoded in Unicode 9.0,
- which consist of 6,881 Tangut characters and 619 other characters.
- Major changes that are most likely to affect implementations are documented
- in <a href="http://www.unicode.org/versions/Unicode9.0.0/#Migration">Section M of the Unicode 9.0.0 page</a>.
- Detailed data file updates resulting from encoding the new characters and from various character
- property changes are summarized below, in the same grouping manner used in
- <a href="http://www.unicode.org/versions/components-9.0.0.html">Components of Unicode 9.0.0</a>.</p>
- <p>Note that minor editorial updates and changes to the derived and extracted data files are not documented here.</p>
- <p>Also note that citations of UTR #51, "Unicode Emoji" in this section refer to UTR #51 prior to
- Version 5.0 [<a href="../tr41/tr41-21.html#UTR51">UTR51</a>].</p>
- <h4>Core Data</h4>
- <ul>
- <li>ArabicShaping.txt
- <ul>
- <li>Entries were added for the newly encoded Arabic letters, as well as the new prefixed format control U+08E2.
- These include three letters used for Warsh orthography, U+08BB..U+08BD, which define their own new joining groups,
- AFRICAN FEH, AFRICAN QAF, and AFRICAN NOON.</li>
- <li>Entries were added for the letters of the newly encoded Adlam script, all of which are dual joining.</li>
- <li>U+202F NARROW NO-BREAK SPACE was explicitly listed for emphasis, because it influences shaping in Mongolian,
- without having changed its joining properties.</li>
- <li>The Joining_Type property value of the Mongolian baluda characters, U+1885 and U+1886, changed to Transparent
- as a result of their reclassification as General_Category=Mn.</li>
- </ul>
- </li>
- <li>Blocks.txt
- <ul>
- <li>A total of 11 new blocks were added, including blocks for the six new scripts and supplemental blocks for three existing scripts, Cyrillic, Glagolitic, and Mongolian.</li>
- <li>The largest script by far in Unicode 9.0, Tangut, spans two dedicated blocks and one character from another new block, Ideographic Symbols and Punctuation.</li>
- </ul>
- </li>
- <li>EastAsianWidth.txt
- <ul>
- <li>The pictographic symbols which have the Emoji_Presentation property as of <a href="../tr51/tr51-7.html">Version 3.0</a>
- of Unicode Technical Report #51, "Unicode Emoji", with the exception of regional indicators, U+1F1E6..U+1F1FF,
- were assigned the East_Asian_Width property value Wide [<a href="../tr41/tr41-21.html#UTR51">UTR51</a>].
- This assignment includes both existing and newly encoded symbols, and ensures consistent treatment of emoji as Wide characters.</li>
- <li>All of the Tangut characters—ideographs, components, and the iteration mark U+16FE0—were assigned the East_Asian_Width property value Wide.</li>
- <li>Most of the other new characters were assigned the East_Asian_Width property value Neutral.</li>
- </ul>
- </li>
- <li>IndicPositionalCategory.txt
- <ul>
- <li>Entries were added for the matras and non-vocalic marks of the three Brahmi-derived scripts introduced in Unicode 9.0—Bhaiksuki, Marchen, and Newa.</li>
- <li>A newly encoded combining mark used with Newa, U+1DFB, was specifically given an Indic_Positional_Category property value.</li>
- <li>Two new marks added to Khojki and Saurashtra were also given Indic_Positional_Category property values.</li>
- </ul>
- </li>
- <li>IndicSyllabicCategory.txt
- <ul>
- <li>Characters in the three newly encoded Brahmi-derived scripts, as well as new characters of existing Indic scripts,
- including Malayalam chillus and Khojki and Saurashtra marks, were added with appropriate property values.</li>
- <li>The rule used to derive the set of characters with the Indic_Syllabic_Category property value Nukta was updated
- to exclude U+1E94A ADLAM NUKTA, as Adlam is not a Brahmi-derived script.</li>
- <li>A few previously encoded Khmer and Myanmar characters, such as the Khamti Shan logograms U+AA74..U+AA76,
- were also assigned specific Indic_Syllabic_Category property values.</li>
- </ul>
- </li>
- <li>LineBreak.txt
- <ul>
- <li>Three new Line_Break property values were introduced, in conjunction with algorithm rules,
- to ensure that the various types of character sequences that represent emoji are handled as indivisible units in line breaking
- [<a href="../tr41/tr41-21.html#UAX14">UAX14</a>, <a href="../tr41/tr41-21.html#UTR51">UTR51</a>].</li>
- <li>Two of the new property values were assigned to characters based on the classification of emoji characters in UTR #51:
- Line_Break=E_Base to the symbols with the UTR #51 binary property Emoji_Modifier_Base,
- and Line_Break=E_Modifier to the characters with the UTR #51 binary property Emoji_Modifier, which consists of the range U+1F3FB..U+1F3FF.
- The affected characters are both existing and new in Unicode 9.0.
- The existing characters that became Line_Break=E_Base had all been Line_Break=Ideographic,
- and the five characters that became Line_Break=E_Modifier had all been Line_Break=Alphabetic.</li>
- <li>The Line_Break property value of U+200D ZERO WIDTH JOINER changed from Combining_Mark to ZWJ,
- the third new Line_Break property value, assigned solely to U+200D.</li>
- <li>For forward compatibility, all of the unassigned code points in the range U+1F000..U+1FFFD,
- whether inside or outside of allocated blocks, were given the default Line_Break property value Ideographic.
- These default values allow better interoperability between applications that support emoji as of different versions of Unicode.</li>
- <li>The Line_Break property values of the halfwidth Katakana and Hangul jamo variants
- in the Halfwidth and Fullwidth Forms block changed from Alphabetic to Ideographic,
- to match the established line breaking behavior of those characters in existing implementations.</li>
- <li>The Line_Break property value of the Mongolian baluda characters, U+1885 and U+1886,
- changed from Alphabetic to Combining_Mark as a result of their reclassification as General_Category=Mn.</li>
- <li>The Line_Break property value of U+2764 HEAVY BLACK HEART changed from Alphabetic to Ideographic,
- as a result of its addition to the set of characters with the UTR #51 binary property Emoji.</li>
- <li>Newly encoded characters were assigned appropriate Line_Break property values.</li>
- </ul>
- </li>
- <li>NamedSequences.txt
- <ul>
- <li>Comment lines were spliced in, documenting the named character sequences that had been included
- in the original set of sequences published in Unicode 4.1.</li>
- </ul>
- </li>
- <li>NamedSequencesProv.txt
- <ul>
- <li>The set of 12 named sequences that represent keycaps, used for emoji, remained provisional
- and were modified to include an explicit emoji variation selector U+FE0F in each sequence.
- The insertion was made in accordance with UTR #51, which states that emoji variation selectors
- are used to control the presentation style of emoji characters that have a default text presentation.</li>
- </ul>
- </li>
- <li>NamesList.txt
- <ul>
- <li>Content was updated throughout with new characters, as well as annotations, cross references, subheadings, and remarks.</li>
- </ul>
- </li>
- <li>PropertyAliases.txt
- <ul>
- <li>The long name alias of the binary property STerm was redefined to Sentence_Terminal,
- for name clarity and disambiguation from the Sentence_Break property value STerm.
- Because the short and long name aliases of the binary property had been identical,
- the redefinition of the long alias is equivalent to the introduction of an additional alias.</li>
- <li>An entry was added for the newly defined binary property, Prepended_Concatenation_Mark, abbreviated PCM.</li>
- </ul>
- </li>
- <li>PropertyValueAliases.txt
- <ul>
- <li>The 9.0 value was added to the catalog property Age.</li>
- <li>Script and Block property values were added for the six new scripts and 11 new blocks introduced.</li>
- <li>Entries were added for the new Line_Break, Grapheme_Cluster_Break, and Word_Break property values
- introduced in the corresponding line breaking and text segmentation algorithms for handling emoji sequences.</li>
- <li>Entries were added for the three new Joining_Group property values introduced with the Arabic letters
- U+08BB..U+08BD, used for Warsh orthography.</li>
- <li>A new section was added for the values of the newly defined binary property Prepended_Concatenation_Mark.</li>
- <li>The comment line marking the section for the binary property STerm was updated with the new long property name alias Sentence_Terminal.</li>
- </ul>
- </li>
- <li>PropList.txt
- <ul>
- <li>Most of the newly encoded combining marks were assigned either the contributory property Other_Alphabetic
- or the binary property Diacritic, as appropriate.</li>
- <li>Newly encoded punctuation characters that mark the end of various sections of text, such as dandas,
- were assigned the appropriate binary properties Terminal_Punctuation or Sentence_Terminal,
- with the latter using the new long name alias instead of STerm.</li>
- <li>The Mongolian baluda characters U+1885..U+1886, which were reclassified from General_Category=Lo to Mn,
- were assigned the contributory properties Other_Alphabetic and Other_ID_Start.
- These assignments were made to preserve the Alphabetic and ID_Start properties of the two characters.
- In particular, the preservation of the ID_Start property is dictated by the stability guarantees for Unicode identifiers.</li>
- <li>The newly encoded Tangut ideographs and components were assigned the Ideographic property (but not the Unified_Ideograph property).</li>
- <li>The Tangut iteration mark U+16FE0 and a few Adlam combining marks were assigned the binary property Extender.</li>
- <li>The stateful tag terminator U+E007F CANCEL TAG, formerly deprecated, was reinstated to non-deprecated, for use in emoji contexts.</li>
- <li>A section was added for the set of characters with the newly defined binary property Prepended_Concatenation_Mark.
- The characters with this property, such as U+0600 ARABIC NUMBER SIGN, are also referred to as
- prefixed format control characters or loosely as subtending marks.</li>
- <li>The contributory property Other_Grapheme_Extend was assigned to the tag characters U+E0020..U+E007F
- and was removed for U+200D ZERO WIDTH JOINER (ZWJ). These changes were made to preserve equality between
- the sets of characters with the property values Grapheme_Cluster_Break=Extend and Grapheme_Extend=Y,
- after the addition of tag characters to, and the removal of ZWJ from, the former set.</li>
- </ul>
- </li>
- <li>Scripts.txt
- <ul>
- <li>The new characters were assigned appropriate Script property values, including six new values for
- the newly encoded scripts: Adlam, Bhaiksuki, Marchen, Newa, Osage, and Tangut.</li>
- <li>The newly encoded emoji were assigned the Script property value Common, in a manner consistent with
- similar characters encoded previously.</li>
- <li>There were no changes of Script property values for any existing characters.</li>
- </ul>
- </li>
- <li>ScriptExtensions.txt
- <ul>
- <li>The Script_Extensions property values of over 200 ideographic symbols, which used to contain multiple Script values
- such as Bopomofo, Hangul, Hiragana, Katakana, as well as Han, were reduced to single-script set values, Script_Extensions={Han}.
- See the resolution of <a href="http://www.unicode.org/review/pri316/">PRI #316</a>.</li>
- <li>As Adlam can use U+0640 ARABIC TATWEEL in the cursive form of the script to graphically extend words,
- the Script_Extensions property value of U+0640 was updated to include the Script value Adlam.</li>
- <li>The Script value Kannada was added to the Script_Extensions property values of the North Indic
- fraction signs U+A830..U+A835, attested in Kannada texts.</li>
- <li>The Script_Extensions property values of the Aegean numeral symbols U+10107..U+10133 were updated to include the Script value Linear_A.</li>
- <li>The Script_Extensions property values of other characters used in multiple scripts were updated accordingly.</li>
- </ul>
- </li>
- <li>StandardizedVariants.txt
- <ul>
- <li>A total of 278 emoji variation sequences were added to complete the set of text and emoji presentations
- for all pictographic symbols identified as having a default text presentation [<a href="../tr41/tr41-21.html#UTR51">UTR51</a>].</li>
- <li>Standardized variation sequences were added to complete the set of dotted forms of Myanmar letters for
- Khamti, Aiton, and Phake, to distinguish them from the Burmese and Shan styles. One of the sequences has
- a spacing combining mark as the initial character of the sequence: &lt;U+1031, U+FE00&gt;.</li>
- <li>A standardized variation sequence was added for the slashed-zero form of the empty set symbol, U+2205.
- A separate standardized variation sequence was added for the form with short diagonal stroke of digit 0,
- U+0030, to avoid misuse of the previous sequence for the variant form of the digit.</li>
- </ul>
- </li>
- <li>TangutSources.txt
- <ul>
- <li>This new data file was added to the UCD. It contains source mappings for Tangut ideographs and components,
- as well as radical-stroke data for the ideographs, in the same format as the Unihan data files.</li>
- </ul>
- </li>
- <li>UnicodeData.txt
- <ul>
- <li>Entries were added for the newly encoded characters, including case pairs and cased letters which form
- case pairs with previously encoded letters.</li>
- <li>The additions include 9 historic Cyrillic letters, U+1C80..U+1C88, which have asymmetric case mappings
- to existing uppercase letters, similar to the asymmetric case mapping of Greek final sigma to capital sigma.</li>
- <li>The additions also include a range of Tangut ideographs, U+17000..U+187EC, which uses the same syntax
- as that for large ranges of characters with algorithmically derived names. For Tangut ideographs, the
- derived names are TANGUT IDEOGRAPH-17000 through TANGUT IDEOGRAPH-187EC.</li>
- <li>Among the new nonspacing combining marks, there are 63 which have nonzero Canonical_Combining_Class values.</li>
- <li>One new character, U+1F23B SQUARED CJK UNIFIED IDEOGRAPH-914D, has a nontrivial compatibility decomposition mapping.</li>
- <li>The Mongolian baluda characters, U+1885 and U+1886, were reclassified as General_Category=Mn,
- and their Bidi_Class property was updated to Nonspacing_Mark, accordingly.</li>
- </ul>
- </li>
- </ul>
- <h4>Unihan Database (Unihan.zip)</h4>
- <ul>
- <li>Unihan_DictionaryIndices.txt
- <ul>
- <li>Dictionary index data was added for 196 ideographs from the CJK Unified Ideographs Extension E block,
- for the first time since the encoding of Extension E in Unicode 8.0.</li>
- </ul>
- </li>
- <li>Unihan_DictionaryLikeData.txt
- <ul>
- <li>The total stroke count values for the 5,771 CJK unified ideographs encoded in Unicode 8.0, which had been missing from Unihan,
- were entirely populated: 9 ideographs in the main CJK Unified Ideographs block and the rest comprising all of the assigned characters
- in the CJK Unified Ideographs Extension E block.</li>
- <li>A few other total stroke count values were corrected, and one kCihaiT field value was added.</li>
- </ul>
- </li>
- <li>Unihan_IRGSources.txt
- <ul>
- <li>A total of 2,828 kIRG_JSource fields were updated to use the latest source references from the Japanese Industrial Standard
- JIS X 0213:2004, instead of the corresponding legacy references from JIS X 0212-1990 and from Unified Japanese IT Vendors Contemporary Ideographs.</li>
- <li>The values of the residual stroke counts in the kRSUnicode fields of 20 CJK unified ideographs were changed from 0 to negative values.
- A negative value indicates that strokes which would normally constitute the indexing radical are intentionally missing.</li>
- <li>A few kIRG_GSource, kIRG_MSource, and kIRG_USource field values were added, and a couple of kIRG_GSource and kIRG_KSource field values were removed.</li>
- <li>The kRSUnicode fields of a small number of other ideographs were also updated with corrections or additional values.</li>
- </ul>
- </li>
- <li>Unihan_RadicalStrokeCounts.txt
- <ul>
- <li>The kRSKangXi fields of the same CJK unified ideographs whose kRSUnicode fields were changed in Unihan_IRGSources.txt
- (with the exception of one Extension E ideograph, U+2C09B) were similarly changed in Unihan_RadicalStrokeCounts.txt.</li>
- </ul>
- </li>
- <li>Unihan_Readings.txt
- <ul>
- <li>Over 600 kMandarin readings and over 100 kHanyuPinlu field values were updated.</li>
- <li>A few kDefinition and kHangul fields were revised, and a couple of kMandarin and kCantonese readings were added.</li>
- </ul>
- </li>
- <li>Unihan_Variants.txt
- <ul>
- <li>A small number of variant relationship mappings were added or updated.</li>
- </ul>
- </li>
- </ul>
- <h4>Data for UAX #45</h4>
- <ul>
- <li>USourceData.txt
- <ul>
- <li>A total of 1,768 unencoded CJK ideographs were added as U-Source ideographs, with the identifiers UTC-01202 through UTC-02968 and UCI-02969.</li>
- </ul>
- </li>
- <li>USourceGlyphs.pdf
- <ul>
- <li>Glyphs were added for the 1,768 U-Source ideographs introduced in USourceData.txt.</li>
- </ul>
- </li>
- </ul>
- <h4>Conformance Test Data</h4>
- <ul>
- <li>BidiCharacterTest.txt
- <ul>
- <li>Tests were added covering the edge cases of the Unicode Bidirectional Algorithm,
- which were subject to changes and clarifications made in Unicode 8.0, described in detail in
- the background document of <a href="http://www.unicode.org/review/pri279/">PRI #279</a>.</li>
- <li>A few other test cases were added for verifying the resolution of deeply nested bracket pairs,
- at the boundary conditions when the number of nested pairs reaches and exceeds the fixed capacity of the bracket stack.</li>
- </ul>
- </li>
- <li>NormalizationTest.txt
- <ul>
- <li>Test cases were added with sequences exercising all newly encoded characters which are nonspacing combining marks
- with nonzero Canonical_Combining_Class values.</li>
- <li>One test case was added with a sequence containing the single newly encoded character which has a
- nontrivial compatibility decomposition mapping, U+1F23B SQUARED CJK UNIFIED IDEOGRAPH-914D.</li>
- <li>Two extra test cases were added, consisting of character sequences with conjoining Hangul jamo and precomposed Hangul syllables.</li>
- </ul>
- </li>
- </ul>
- <h4>Auxiliary Data for UAX #14 and UAX #29</h4>
- <ul>
- <li>GraphemeBreakProperty.txt
- <ul>
- <li>The Grapheme_Cluster_Break class Prepend, previously empty, was populated with a set of characters,
- according to its new derivation expression [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>].
- The set includes the characters with the newly defined binary property Prepended_Concatenation_Mark,
- which used to be Grapheme_Cluster_Break=Control, as well as a few other characters with the
- Indic_Syllabic_Category property values Consonant_Preceding_Repha and Consonant_Prefixed.</li>
- <li>The newly encoded combining marks were assigned the Grapheme_Cluster_Break property values
- Extend or SpacingMark, largely by derivation from their General_Category property values.</li>
- <li>The Mongolian baluda characters, U+1885 and U+1886, became Grapheme_Cluster_Break=Extend also
- by derivation, following their reclassification as General_Category=Mn.</li>
- <li>The tag characters U+E0020..U+E007F, all of them non-deprecated as of Unicode 9.0, were moved from
- the Grapheme_Cluster_Break class Control to Extend.</li>
- <li>U+200D ZERO WIDTH JOINER, formerly Grapheme_Cluster_Break=Extend, formed a new class by itself,
- Grapheme_Cluster_Break=ZWJ. The new property value is used in the Grapheme Cluster Boundary Algorithm
- for the handling of emoji zwj sequences defined in UTR #51 as indivisible units [<a href="../tr41/tr41-21.html#UTR51">UTR51</a>].</li>
- <li>The pictographic symbols with the UTR #51 binary property Emoji_Modifier_Base formed two newly defined
- Grapheme_Cluster_Break classes, E_Base and E_Base_GAZ. The partitioning is determined by the additional
- presence or absence of those characters in the set of emoji zwj sequences defined in UTR #51.</li>
- <li>Other pictographic symbols that appear in emoji zwj sequences (after ZWJ) but do not have the UTR #51
- binary property Emoji_Modifier_Base formed an additional new class, Grapheme_Cluster_Break=Glue_After_Zwj.</li>
- <li>The characters with the UTR #51 binary property Emoji_Modifier formed the last emoji-related,
- newly defined Grapheme_Cluster_Break class E_Modifier.</li>
- </ul>
- </li>
- <li>GraphemeBreakTest.txt
- <ul>
- <li>Test cases were added exercising the newly populated Grapheme_Cluster_Break class Prepend.</li>
- <li>Test cases were added exercising the newly defined emoji-related Grapheme_Cluster_Break property values
- E_Base, E_Base_GAZ, Glue_After_Zwj, and E_Modifier, also in combinations with the newly factored out ZWJ.</li>
- <li>Test cases were updated to illustrate grapheme cluster boundaries in sequences of regional indicator
- characters, according to the revised Grapheme Cluster Boundary Algorithm: in sequences of more than two,
- regional indicators are kept together in pairs.</li>
- <li>The rule numbers reported in the test results were updated according to the revised Grapheme Cluster Boundary Algorithm.</li>
- </ul>
- </li>
- <li>LineBreakTest.txt
- <ul>
- <li>Many test cases were added exercising the newly defined emoji-related Line_Break property values
- E_Base and E_Modifier, as well as ZWJ.</li>
- <li>The expected test results were updated according to the revised rules of the Unicode Line Breaking Algorithm.</li>
- </ul>
- </li>
- <li>SentenceBreakProperty.txt
- <ul>
- <li>Newly encoded characters were assigned the Sentence_Break property values Extend, Format, Lower,
- Numeric, OLetter, STerm, or Upper, by derivation from their primary property values.</li>
- <li>The Sentence_Break property values of the Mongolian baluda characters, U+1885 and U+1886, changed from
- OLetter to Extend also by derivation, following their reclassification as General_Category=Mn.</li>
- <li>The tag characters U+E0020..U+E007F were moved from the Sentence_Break class Format to Extend.</li>
- </ul>
- </li>
- <li>SentenceBreakTest.txt
- <ul>
- <li>The rule numbers reported in the test results were updated, to reflect the renumbering of one rule
- of the Sentence Boundary Algorithm.</li>
- <li>A few test cases were added, removed, or reordered.</li>
- </ul>
- </li>
- <li>WordBreakProperty.txt
- <ul>
- <li>Newly encoded characters were assigned the Word_Break property values ALetter, Extend, Format, or Numeric,
- by derivation from other property values.</li>
- <li>The Word_Break property values of the Mongolian baluda characters, U+1885 and U+1886, changed from
- ALetter to Extend also by derivation, following their reclassification as General_Category=Mn.</li>
- <li>The tag characters U+E0020..U+E007F were moved from the Word_Break class Format to Extend.</li>
- <li>The newly introduced Word_Break property values related to emoji—E_Base, E_Base_GAZ, Glue_After_Zwj, and
- E_Modifier—were assigned to the same sets of pictographic symbols as the similarly named Grapheme_Cluster_Break property values were.</li>
- <li>U+200D ZERO WIDTH JOINER formed a new class by itself, Word_Break=ZWJ, also similar to the Grapheme_Cluster_Break reclassification of U+200D.</li>
- <li>The Word_Break property value of U+202F NARROW NO-BREAK SPACE changed from the default Other to ExtendNumLet.</li>
- </ul>
- </li>
- <li>WordBreakTest.txt
- <ul>
- <li>Test cases were added exercising the newly defined emoji-related Word_Break property values
- E_Base, E_Base_GAZ, Glue_After_Zwj, and E_Modifier, also in combinations with the newly factored out ZWJ.</li>
- <li>Test cases were updated to illustrate word boundaries in sequences of regional indicator characters,
- according to the revised Word Boundary Algorithm.</li>
- </ul>
- </li>
- </ul>
- <h4>Documentation for Auxiliary Data</h4>
- <ul>
- <li>GraphemeBreakTest.html
- <ul>
- <li>The pair table was updated to include the five newly defined Grapheme_Cluster_Break property values—E_Base,
- E_Base_GAZ, Glue_After_Zwj, E_Modifier, and ZWJ—as well as the existing but now populated class Prepend.</li>
- <li>The test rules were updated to match those in the Grapheme Cluster Boundary Algorithm as defined in
- Unicode Standard Annex #29, "Unicode Text Segmentation" [<a href="../tr41/tr41-21.html#UAX29">UAX29</a>].</li>
- <li>The sample test cases were updated and a few more were added.</li>
- </ul>
- </li>
- <li>LineBreakTest.html
- <ul>
- <li>The pair table was updated to include the three newly defined Line_Break property values—E_Base, E_Modifier, and ZWJ.</li>
- <li>The test rules were updated as a result of the changes in the Unicode Line Breaking Algorithm [<a href="../tr41/tr41-21.html#UAX14">UAX14</a>].</li>
- <li>Several sample test cases were added.</li>
- </ul>
- </li>
- <li>SentenceBreakTest.html
- <ul>
- <li>One rule of the Sentence Boundary Algorithm was renumbered, reflecting the same change made in UAX #29.</li>
- <li>A few sample test cases were added, removed, or reordered.</li>
- </ul>
- </li>
- <li>WordBreakTest.html
- <ul>
- <li>The pair table, test rules, and sample test cases were updated in a manner similar to the corresponding updates made in GraphemeBreakTest.html.</li>
- </ul>
- </li>
- </ul>
- <hr>
- <h2 class="nonumber"><a name="Acknowledgments" href="#Acknowledgments">Acknowledgments</a></h2>
- <p>Mark Davis and Ken Whistler are the authors of the initial version and have added to and
- maintained the text of this annex. Laurențiu Iancu assisted
- in the documentation of UCD changes for Versions 6.3.0 through 10.0.0. Julie Allen and Asmus Freytag provided editorial
- suggestions for improvement of the text. Over the years, many
- members of the UTC have participated in the review of the UCD
- and its documentation.</p>
- <h2 class="nonumber"><a name="References" href="#References">References</a></h2>
- <p>For references for this annex, see Unicode Standard Annex #41, "<a href="../tr41/tr41-21.html">Common
- References for Unicode Standard Annexes</a>."</p>
-
- <h2 class="nonumber"><a name="Modifications" href="#Modifications">Modifications</a></h2>
-
- <p>The following summarizes modifications from previous revisions of this
- annex.</p>
- <h3>Revision 20 [KW, LI]</h3>
- <ul>
- <li><b>Reissued</b> for Unicode 10.0.0.</li>
- <li>Removed old UCD Change History entry for Unicode 8.0.0, and added new one for Unicode 10.0.0.</li>
- <li>Updated the description of the <a href="#Name">Name</a> property value.</li>
- <li>Updated the discussion of immutable properties and the list of those properties in
- <a href="#Immutable_Properties_Table">Table 19</a>.</li>
- <li>Added a new Table 10a, <a href="#Contributory_Properties_Table">Contributory Properties</a>
- in Section 5.5.</li>
- <li>Added a row to Table 5, <a href="#UCD_Files_Table">Files in the UCD</a> for
- NushuSources.txt. Tweaked content elsewhere to account for this new addition.</li>
- <li>Added new Section 5.13 <a href="#Property_APIs">Property APIs</a>.</li>
- <li>Updated Table 9, <a href="#Property_List_Table">Property Table</a> to show
- that the <a href="#Ideographic">Ideographic</a> property, rather than the
- Unified_Ideograph property, is now used in the definition of Ideographic Description
- Sequences.</li>
- <li>Added entries for the <a href="#Vertical_Orientation">Vertical_Orientation</a>
- and <a href="#Regional_Indicator">Regional_Indicator</a> properties
- in Table 9, <a href="#Property_List_Table">Property Table</a>.</li>
- <li>Adjusted the discussion of the <a href="#Block">Block</a> property in
- Table 9, <a href="#Property_List_Table">Property Table</a>.</li>
- <li>Added default value for the <a href="#Vertical_Orientation">Vertical_Orientation</a> property
- in Table 4, <a href="#Default_Values_Table">Default Values for Properties</a>
- and added an indication that the default values for Vertical_Orientation are complex.</li>
- <li>Added discussion of new data file DerivedName.txt to Section 5.4,
- <a href="#Derived_Extracted">Derived Extracted Properties</a>.</li>
- <li>Added new Section 2.1.3, <a href="#Props_External">Properties Dependent on External
- Specifications</a> to discuss the dependency of UCD segmentation properties on the
- non-UCD emoji properties.</li>
- <li>Added new Section 5.14, <a href="#Character_Age">Character Age</a> to further explain
- the details of the Age property and its derivation.</li>
- <li>Added column indicating which default values are complex in
- Table 4, <a href="#Default_Values_Table">Default Values for Properties</a>.</li>
- <li>Updated various mentions of "U-Source ideographs" to "UTC-Source ideographs".</li>
- </ul>
- <p>Revision 19 being a proposed update, only changes between revisions 20 and
- 18 are noted here.</p>
- <h3>Revision 18 [KW, LI]</h3>
- <ul>
- <li><b>Reissued</b> for Unicode 9.0.0.</li>
- <li>Removed old UCD Change History entry for Unicode 7.0.0, and added new one for Unicode 9.0.0.</li>
- <li>Updated Section 3.4 <a href="#StandardizedVariants">StandardizedVariants.html</a> to
- document the obsolescence of that file and the alternative means now available for
- displaying reference glyphs for standardized variants.</li>
- <li>Added new Section 3.5 <a href="#EmojiVariants">Emoji Variation Sequences</a> to
- document the page on the emoji subsite showing the glyphs for the emoji variation sequences.</li>
- <li>Updated documentation for <a href="#STerm">Sentence_Terminal</a> to use the long alias.</li>
- <li>Updated documentation for <a href="#Ideographic">Ideographic</a> and
- <a href="#Unified_Ideograph">Unified_Ideograph</a> to clarify their relationship.</li>
- <li>Added a row to Table 5, <a href="#UCD_Files_Table">Files in the UCD</a> for
- TangutSources.txt. Tweaked content elsewhere to account for this new addition.</li>
- <li>Added clarification in Section 5.7.5
- <a href="#Decompositions_and_Normalization">Decompositions and Normalization</a>
- regarding which normalization-related properties should or should not be exported
- in an API.</li>
- <li>Added note in Section 5.12 <a href="#Deprecation">Deprecation</a>
- indicating that deprecated properties are not recommended for support in APIs.</li>
- <li>Added documentation for <a href="#Prepended_Concatenation_Mark">Prepended_Concatenation_Mark</a>.</li>
- <li>Updated statement about default values for the Line_Break property in
- Section 4.2.9 <a href="#Default_Values">Default Values</a>.</li>
- </ul>
- <p>Revision 17 being a proposed update, only changes between revisions 18 and
- 16 are noted here.</p>
- <h3>Revision 16 [KW, LI]</h3>
- <ul>
- <li><b>Reissued</b> for Unicode 8.0.0.</li>
- <li>Removed old UCD Change History entry for Unicode 6.3.0, and added new one for Unicode 8.0.0.</li>
- <li>Clarified the intent for the information contained in <a href="#Property_List_Table">Table 9</a>
- in Section 5.3 <a href="#Property_Definitions">Property Definitions</a>.</li>
- <li>Updated table styles.</li>
- <li>Renamed Indic_Matra_Category to <a href="#Indic_Positional_Category">Indic_Positional_Category</a>, with corresponding change in the file name.</li>
- <li>Changed <a href="#Indic_Syllabic_Category">Indic_Syllabic_Category</a> and the renamed
- <a href="#Indic_Positional_Category">Indic_Positional_Category</a> from Provisional to Informative status.</li>
- <li>Added information about location of UCD.zip and the URL for zipped/latest.</li>
- </ul>
-
- <p>Revision 15 being a proposed update, only changes between revisions 16 and
- 14 are noted here.</p>
- <h3>Revision 14 [KW, LI]</h3>
- <ul>
- <li><b>Reissued</b> for Unicode 7.0.0.</li>
- <li>Removed old UCD Change History entry for Unicode 6.2.0, and added new one for Unicode 7.0.0.</li>
- <li>Updated chapter references for Unicode 7.0.0.</li>
- <li>Updated the derivation of the <a href="#Alphabetic">Alphabetic</a> property.</li>
- <li>Updated the derivation of the <a href="#Case_Ignorable">Case_Ignorable</a> property.</li>
- <li>Simplified the discussion of @missing in Section 4.2.10 <a href="#Missing_Conventions">@missing Conventions</a>,
- to reflect the revised conventions in the UCD data files, which eliminated special edge cases.</li>
- <li>Corrected statement about aliases for provisional properties in Section 5.8
- <a href="#Property_Aliases">Property and Property Value Aliases</a>.</li>
- <li>Minor editing.</li>
- </ul>
- <p>Revision 13 being a proposed update, only changes between revisions 14 and
- 12 are noted here.</p>
- <h3>Revision 12 [KW, LI]</h3>
- <ul>
- <li><b>Reissued</b> for Unicode 6.3.0.</li>
- <li>Removed old UCD Change History entry for Unicode 6.1.0, and added new one for Unicode 6.3.0.</li>
- <li>Added a clarification about <a href="#Numeric_Type">Numeric_Type</a>=Digit.</li>
- <li>Added documentation of default values for Line_Break, added additional default values
- for Bidi_Class, and clarified the usage of @missing in Section 4.2.9 <a href="#Default_Values">Default Values</a>.</li>
- <li>Added new Section 4.2.10 <a href="#Missing_Conventions">@missing Conventions</a>, to spell out
- syntax and other issues for @missing lines in more detail.</li>
- <li>Clarified the status of default values in Section 5.4 <a href="#Derived_Extracted">Derived Extracted Properties</a>.</li>
- <li>Added information about the derived status of kCompatibilityVariant in Section 5.7.3
- <a href="#Character_Decomposition_Mappings">Character Decomposition Mapping</a>.</li>
- <li>Added an entry for BidiBrackets.txt and two new bidi properties to <a href="#Property_List_Table">Table 9. Property Table</a>
- and relevant links elsewhere.</li>
- <li>Added BidiCharacterTest.txt to the list of test data files and provided a brief description of its contents in
- Section 6.3 <a href="#BidiTest_txt">Bidirectional Test Files</a>.</li>
- <li>Added new isolate controls to <a href="#BC_Values_Table">Table 13. Bidi_Class Values</a> and reordered
- entries to match the listing in UAX #9.</li>
- <li>Added documentation about the new permalink for the latest UCD release, in Section 4.1
- <a href="#Directory_Structure">Directory Structure</a>.</li>
- </ul>
-
- <p>Revision 11 being a proposed update, only changes between revisions 12 and
- 10 are noted here.</p>
- <h3>Revision 10 [KW]</h3>
- <ul>
- <li><b>Reissued</b> for Unicode 6.2.0.</li>
- <li>Removed old UCD Change History entry for Unicode 6.0.0, and added new one for Unicode 6.2.0.</li>
- <li>Updated status of <a href="#Script_Extensions">Script_Extensions</a> to informative.</li>
- <li>Updated type of <a href="#Bidi_Mirroring_Glyph">Bidi_Mirroring_Glyph</a>
- from String to Miscellaneous.</li>
- <li>Marked <a href="#Unicode_1_Name">Unicode_1_Name</a> as Obsolete and updated its documentation.</li>
- <li>Added text indicating that the UTC must approve any change to normative or informative
- property values, in Section 2.3.1 <a href="#Allowed_Changes">Changes to Properties Between Releases</a>.</li>
- <li>Corrected numbering error for Section 2.3.4 <a href="#Stabilized_Properties">Stabilized Properties</a>.</li>
- <li>Updated the note about NamesList.txt being encoded in Latin-1, because starting with Version 6.2.0, it
- is encoded in UTF-8. See Section 4.2.11 <a href="#Text_Encoding">Text Encoding</a>.</li>
- <li>Added indication that ccc=133 is reserved in Section 5.11.2
- <a href="#Validation_of_CCC">Combining_Character_Class Property</a>.</li>
- <li>Added Section 3.6 <a href="#USource">U-Source Ideographs and UAX #45</a>.</li>
- <li>Added entries to <a href="#UCD_Files_Table">Table 5</a> for USourceData.txt and USourceGlyphs.pdf.</li>
- <li>Removed entry for ScriptExtensions.txt from <a href="#UCD_Files_Table">Table 5</a>.</li>
- </ul>
-
- <p>Revision 9 being a proposed update, only changes between revisions 10 and
- 8 are noted here.</p>
- <h3>Revision 8 [KW]</h3>
- <ul>
- <li><b>Reissued</b> for Unicode 6.1.0.</li>
- <li>Removed old UCD Change History entry for Unicode 5.2.0, and added new one for Unicode 6.1.0.</li>
- <li>Added details of data file changes for Unicode 6.1.0.</li>
- <li>Updated derivation of <a href="#Default_Ignorable_Code_Point">Default_Ignorable_Code_Point</a> to account for U+0604.</li>
- <li>Added a clarification about empty field values in data files for string properties
- in a new Section 4.2.10 <a href="#Empty_Fields">Empty Fields</a>.</li>
- <li>Added a warning about matching alternative, non-standard names in Section 5.9
- <a href="#Matching_Rules">Matching Rules</a>.</li>
- <li>Added new Section 4.2.8 <a href="#Multiple_Values">Multiple Values for Properties</a>.</li>
- <li>Added new Section 5.7.6 <a href="#Property_Values_As_Sets">Properties Whose Values Are Sets of Values</a>.</li>
- <li>Added documentation of symbolic labels for fixed position canonical combining classes
- in <a href="#CCC_Values_Table">Table 15</a>.</li>
- <li>Updated wording regarding addition of new property values in Section 5.10 <a href="#Invariants">Invariants</a>.</li>
- <li>Corrected URL for the Resolved PRI page reference.</li>
- <li>Added a paragraph about aliases of the form "Ccc10" for fixed position classes
- in <a href="#Canonical_Combining_Class_Values">Canonical Combining Class Values</a>.</li>
- <li>Clarified the current status of the "n/a" metavalue for PropertyValueAliases.txt, in
- <a href="#Property_Aliases">Property and Property Value Aliases</a>.</li>
- <li>Updated regex in <a href="#Common_Subexpressions_Table">Table 20</a> and <a href="#Regular_Expressions_Table">Table 21</a>.</li>
- <li>Updated the description of the <a href="#Name_Alias">Name_Alias</a> property, to account for new types of formal name
- aliases now included in NameAliases.txt.</li>
- <li>Added new Section 5.11.5 <a href="#Validation_of_Multivalued">Validation of Multivalued Properties</a>.</li>
- <li>Added new entry for <a href="#Script_Extensions">Script_Extensions</a> in the Property Table.</li>
- <li>Updated <a href="#Invariants_in_Implementations">Invariants in Implementations</a> and related
- sections to reflect change in range for Canonical_Combining_Class from 0..255 to 0..254.</li>
- <li>Added note to <a href="#Validation_of_CCC">Combining_Character_Class Property</a> regarding
- implementation use of reserved value 255.</li>
- <li>Added a gray background to entries for contributory properties in the
- <a href="#Property_Index">Property Index</a>.</li>
- <li>Added documentation regarding abbreviations and long aliases for General_Category groupings
- in <a href="#GC_Values_Table">Table 12. General_Category Values</a>.</li>
- <li>Corrected several numerical references to definitions related to casing properties in
- <a href="#Property_List_Table">Table 9. Property Table</a>.</li>
- <li>Added information regarding longest canonical and compatibility mappings in
- <a href="#Character_Decomposition_Mappings">5.7.3 Character Decomposition Mapping</a>.</li>
- <li>Updated status of Grapheme_Base and Grapheme_Extend to normative and corrected their
- descriptions in <a href="#Property_List_Table">Table 9. Property Table</a>.</li>
- <li>Added clarification regarding edge case treatment for Other_Punctuation,
- Other_Symbol, etc. in <a href="#General_Category_Values">5.7.1 General Category Values</a>.</li>
- <li>Added a description and example of the form of derived property definitions in
- <a href="#Simple_Derived">2.1 Simple and Derived Properties</a>.</li>
- <li>Various small editorial fixes.</li>
- </ul>
-
- <p>Revision 7 being a proposed update, only changes between revisions 8 and
- 6 are noted here.</p>
- <h3>Revision 6 [KW]</h3>
- <ul>
- <li><b>Reissued</b> for Unicode 6.0.0.</li>
- <li>Removed old UCD Change History entries prior to Unicode 5.2.0.</li>
- <li>Updated status of <a href="#Hyphen">Hyphen</a> and <a href="#ISO_Comment">ISO_Comment</a> properties to Deprecated.</li>
- <li>Updated status of several derived normalization properties to Deprecated.</li>
- <li>Added tables listing <a href="#Deprecated_Property_Table">Deprecated</a> and <a href="#Stabilized_Property_Table">Stabilized</a> properties.</li>
- <li>Extended the discussion of the significance of the <a href="#Bidi_Mirroring_Glyph">Bidi_Mirroring_Glyph</a> property.</li>
- <li>Clarified the intended application of the <a href="#Ideographic">Ideographic</a>
- and <a href="#Unified_Ideograph">Unified_Ideograph</a> properties.</li>
- <li>Moved Property Summary to top of Section 5, renamed it to Property Index,
- and adjusted Section 5 numbering.</li>
- <li>Renumbered tables to account for two table insertions.</li>
- <li>Rewrote the description of the <a href="#Logical_Order_Exception">Logical_Order_Exception</a>
- and <a href="#White_Space">White_Space</a> properties for clarity.</li>
- <li>Added clarification for <a href="#UAX44-LM2">UAX44-LM2</a> in <a href="#Matching_Rules">Matching Rules</a>.</li>
- <li>Updated matching rule <a href="#UAX44-LM3">UAX44-LM3</a> to ignore initial "is" in <a href="#Matching_Rules">Matching Rules</a>.</li>
- <li>Added U+110BD to the list of exceptions to the derivation of <a href="#Default_Ignorable_Code_Point">Default_Ignorable_Code_Point</a>.</li>
- <li>Added anchors to the matching rules.</li>
- <li>Updated the description fields for <a href="#FC_NFKC_Closure">FC_NFKC_Closure</a>
- and <a href="#NFKC_Casefold">NFKC_Casefold</a>.</li>
- <li>Added entries for EmojiSources.txt and ScriptExtensions.txt to <a href="#UCD_Files_Table">Table 5</a>.</li>
- <li>Added entries for <a href="#Indic_Syllabic_Category">Indic_Syllabic_Category</a> and
- <a href="#Indic_Matra_Category">Indic_Matra_Category</a>.</li>
- <li>Added note clarifying that aliases are not provided for provisional properties in <a href="#Property_Aliases">Section 5.8</a>.</li>
- <li>Added clarification on value ranges and other restrictions for decimal digits in
- discussion of <a href="#Numeric_Type">Numeric_Type</a>.</li>
- <li>Miscellaneous minor point edits.</li>
- </ul>
- <p>Revision 5 being a proposed update, only changes between revisions 6 and
- 4 are noted here.</p>
- <h3>Revision 4 [KW]</h3>
- <ul>
- <li><b>Reissued</b> for Unicode 5.2.0.</li>
- <li>Completely reorganized and rewritten, to include all the content
- from the obsoleted <a href="http://www.unicode.org/Public/5.1.0/ucd/UCD.html">UCD.html</a>.</li>
- <li>Added Section 5.10 re deprecation.</li>
- <li>Added subsection in Section 4.2 re line termination conventions.</li>
- <li>Added Contributory as a formal status and updated the Property Table accordingly.</li>
- <li>Added note in Section 5.3.1 to indicate that
- contributory properties are neither normative nor informative.</li>
- <li>Updated documentation for default values.</li>
- <li>Cleaned up description of numeric properties.</li>
- <li>Tweaked the description of NamesList.html.</li>
- <li>Miscellaneous minor point edits.</li>
- <li>Updated summary statement of the document.</li>
- <li>Centered tables.</li>
- <li>Added anchors and numbers to tables and adjusted text referencing tables accordingly.</li>
- <li>Added clarifications about exceptional format issues for Unihan data files.</li>
- <li>Updated references to <i>Section 4.8, Name—Normative</i> for
- derived names and for code point labels.</li>
- <li>Added mention of property aliases from Unihan data files to Section 5.6.1.</li>
- <li>Added documentation for new derived properties: Cased, Case_Ignorable,
- Changes_When_Lowercased,
- Changes_When_Uppercased, Changes_When_Titlecased, Changes_When_Casefolded, Changes_When_Casemapped,
- NFKC_Casefold, and Changes_When_NFKC_Casefolded.</li>
- <li>Added strong pointers to Section 3.5 and Chapter 4 of [Unicode] in the Introduction.</li>
- <li>Added new <i>Section 2.3.1, Changes to Properties Between Releases</i>.</li>
- <li>Updated default values for East_Asian_Width.</li>
- <li>Clarified the applicability of comments in cases where properties have multiple
- default values.</li>
- <li>Restructured Section 5.1 documentation of columns in the property table, for better
- text flow.</li>
- <li>Reordered entries for DerivedCoreProperties.txt in the property table, for clarity.</li>
- <li>Added documentation of new test file: BidiTest.txt.</li>
- <li>Updated terminology related to the Unihan Database.</li>
- <li>Added documentation for the new data file, CJKRadicals.txt.</li>
- <li>Added Attached_Above for ccc=214 in Table 13.</li>
- <li>Complete revision of Validation section and associated tables.</li>
- <li>Minor revision of text in <i>Section 4.1.5, File Directory Differences for Early Releases</i>.</li>
- <li>Added a cautionary note about the use of the Age property in regular expressions.</li>
- <li>Added sections explaining obsolete, deprecated, and stabilized properties, and
- clearly identified existing such properties in the property table.</li>
- </ul>
- <p>Revision 3 being a proposed update, only changes between revisions 4 and
- 2 are noted here.</p>
- <h3>Revision 2</h3>
- <ul>
- <li>Initial approved version for Unicode 5.1.0.</li>
- </ul>
- <h3>Revision 1</h3>
- <ul>
- <li>Initial draft.</li>
- </ul>
-
- <hr>
- <p class="copyright">© 2017 Unicode, Inc. All Rights Reserved. The Unicode
- Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors
- or omissions. No liability is assumed for incidental and consequential damages in connection with
- or arising out of the use of the information or programs contained or accompanying this technical
- report. The Unicode <a href="http://www.unicode.org/copyright.html">Terms of Use</a> apply.</p>
- <p class="copyright">Unicode and the Unicode logo are trademarks of Unicode, Inc., and are
- registered in some jurisdictions.</p>
- </div> <!-- body -->
- </body>
- </html>
|