letter_test.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536
  1. // Copyright 2009 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package unicode_test
  5. import (
  6. "flag"
  7. "fmt"
  8. "runtime"
  9. "sort"
  10. "testing"
  11. . "unicode"
  12. )
// upperTest holds runes that TestIsUpper expects IsUpper to accept and that
// TestIsLetter expects IsLetter to accept.
var upperTest = []rune{
	0x41,
	0xc0,
	0xd8,
	0x100,
	0x139,
	0x14a,
	0x178,
	0x181,
	0x376,
	0x3cf,
	0x1f2a,
	0x2102,
	0x2c00,
	0x2c10,
	0x2c20,
	0xa650,
	0xa722,
	0xff3a,
	0x10400,
	0x1d400,
	0x1d7ca,
}
// notupperTest holds runes that TestIsUpper expects IsUpper to reject.
var notupperTest = []rune{
	0x40,
	0x5b,
	0x61,
	0x185,
	0x1b0,
	0x377,
	0x387,
	0x2150,
	0xffff,
	0x10000,
}
// letterTest holds runes that TestIsLetter expects IsLetter to accept.
// TestIsSpace reuses them as runes that IsSpace must reject.
var letterTest = []rune{
	0x41,
	0x61,
	0xaa,
	0xba,
	0xc8,
	0xdb,
	0xf9,
	0x2ec,
	0x535,
	0x620,
	0x6e6,
	0x93d,
	0xa15,
	0xb99,
	0xdc0,
	0xedd,
	0x1000,
	0x1200,
	0x1312,
	0x1401,
	0x1885,
	0x2c00,
	0xa800,
	0xf900,
	0xfa30,
	0xffda,
	0xffdc,
	0x10000,
	0x10300,
	0x10400,
	0x20000,
	0x2f800,
	0x2fa1d,
}
// notletterTest holds runes that TestIsLetter expects IsLetter to reject and
// that TestIsUpper expects IsUpper to reject.
var notletterTest = []rune{
	0x20,
	0x35,
	0x375,
	0x619,
	0x700,
	0xfffe,
	0x1ffff,
	0x10ffff,
}
// spaceTest holds runes that TestIsSpace expects IsSpace to accept.
// Contains all the special cased Latin-1 chars.
var spaceTest = []rune{
	0x09,
	0x0a,
	0x0b,
	0x0c,
	0x0d,
	0x20,
	0x85,
	0xA0,
	0x2000,
	0x3000,
}
  106. type caseT struct {
  107. cas int
  108. in, out rune
  109. }
// caseTest lists the case-mapping expectations shared by TestTo,
// TestToUpperCase, TestToLowerCase and TestToTitleCase. Each entry is
// {mapping selector, input rune, expected output rune}.
var caseTest = []caseT{
	// errors
	{-1, '\n', 0xFFFD},
	{UpperCase, -1, -1},
	{UpperCase, 1 << 30, 1 << 30},
	// ASCII (special-cased so test carefully)
	{UpperCase, '\n', '\n'},
	{UpperCase, 'a', 'A'},
	{UpperCase, 'A', 'A'},
	{UpperCase, '7', '7'},
	{LowerCase, '\n', '\n'},
	{LowerCase, 'a', 'a'},
	{LowerCase, 'A', 'a'},
	{LowerCase, '7', '7'},
	{TitleCase, '\n', '\n'},
	{TitleCase, 'a', 'A'},
	{TitleCase, 'A', 'A'},
	{TitleCase, '7', '7'},
	// Latin-1: easy to read the tests!
	{UpperCase, 0x80, 0x80},
	{UpperCase, 'Å', 'Å'},
	{UpperCase, 'å', 'Å'},
	{LowerCase, 0x80, 0x80},
	{LowerCase, 'Å', 'å'},
	{LowerCase, 'å', 'å'},
	{TitleCase, 0x80, 0x80},
	{TitleCase, 'Å', 'Å'},
	{TitleCase, 'å', 'Å'},
	// 0131;LATIN SMALL LETTER DOTLESS I;Ll;0;L;;;;;N;;;0049;;0049
	{UpperCase, 0x0131, 'I'},
	{LowerCase, 0x0131, 0x0131},
	{TitleCase, 0x0131, 'I'},
	// 0133;LATIN SMALL LIGATURE IJ;Ll;0;L;<compat> 0069 006A;;;;N;LATIN SMALL LETTER I J;;0132;;0132
	{UpperCase, 0x0133, 0x0132},
	{LowerCase, 0x0133, 0x0133},
	{TitleCase, 0x0133, 0x0132},
	// 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B;
	{UpperCase, 0x212A, 0x212A},
	{LowerCase, 0x212A, 'k'},
	{TitleCase, 0x212A, 0x212A},
	// From an UpperLower sequence
	// A640;CYRILLIC CAPITAL LETTER ZEMLYA;Lu;0;L;;;;;N;;;;A641;
	{UpperCase, 0xA640, 0xA640},
	{LowerCase, 0xA640, 0xA641},
	{TitleCase, 0xA640, 0xA640},
	// A641;CYRILLIC SMALL LETTER ZEMLYA;Ll;0;L;;;;;N;;;A640;;A640
	{UpperCase, 0xA641, 0xA640},
	{LowerCase, 0xA641, 0xA641},
	{TitleCase, 0xA641, 0xA640},
	// A64E;CYRILLIC CAPITAL LETTER NEUTRAL YER;Lu;0;L;;;;;N;;;;A64F;
	{UpperCase, 0xA64E, 0xA64E},
	{LowerCase, 0xA64E, 0xA64F},
	{TitleCase, 0xA64E, 0xA64E},
	// A65F;CYRILLIC SMALL LETTER YN;Ll;0;L;;;;;N;;;A65E;;A65E
	{UpperCase, 0xA65F, 0xA65E},
	{LowerCase, 0xA65F, 0xA65F},
	{TitleCase, 0xA65F, 0xA65E},
	// From another UpperLower sequence
	// 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L ACUTE;;;013A;
	{UpperCase, 0x0139, 0x0139},
	{LowerCase, 0x0139, 0x013A},
	{TitleCase, 0x0139, 0x0139},
	// 013F;LATIN CAPITAL LETTER L WITH MIDDLE DOT;Lu;0;L;<compat> 004C 00B7;;;;N;;;;0140;
	{UpperCase, 0x013f, 0x013f},
	{LowerCase, 0x013f, 0x0140},
	{TitleCase, 0x013f, 0x013f},
	// 0148;LATIN SMALL LETTER N WITH CARON;Ll;0;L;006E 030C;;;;N;LATIN SMALL LETTER N HACEK;;0147;;0147
	{UpperCase, 0x0148, 0x0147},
	{LowerCase, 0x0148, 0x0148},
	{TitleCase, 0x0148, 0x0147},
	// Last block in the 5.1.0 table
	// 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428;
	{UpperCase, 0x10400, 0x10400},
	{LowerCase, 0x10400, 0x10428},
	{TitleCase, 0x10400, 0x10400},
	// 10427;DESERET CAPITAL LETTER EW;Lu;0;L;;;;;N;;;;1044F;
	{UpperCase, 0x10427, 0x10427},
	{LowerCase, 0x10427, 0x1044F},
	{TitleCase, 0x10427, 0x10427},
	// 10428;DESERET SMALL LETTER LONG I;Ll;0;L;;;;;N;;;10400;;10400
	{UpperCase, 0x10428, 0x10400},
	{LowerCase, 0x10428, 0x10428},
	{TitleCase, 0x10428, 0x10400},
	// 1044F;DESERET SMALL LETTER EW;Ll;0;L;;;;;N;;;10427;;10427
	{UpperCase, 0x1044F, 0x10427},
	{LowerCase, 0x1044F, 0x1044F},
	{TitleCase, 0x1044F, 0x10427},
	// First one not in the 5.1.0 table
	// 10450;SHAVIAN LETTER PEEP;Lo;0;L;;;;;N;;;;;
	{UpperCase, 0x10450, 0x10450},
	{LowerCase, 0x10450, 0x10450},
	{TitleCase, 0x10450, 0x10450},
	// Non-letters with case.
	{LowerCase, 0x2161, 0x2171},
	{UpperCase, 0x0345, 0x0399},
}
  206. func TestIsLetter(t *testing.T) {
  207. for _, r := range upperTest {
  208. if !IsLetter(r) {
  209. t.Errorf("IsLetter(U+%04X) = false, want true", r)
  210. }
  211. }
  212. for _, r := range letterTest {
  213. if !IsLetter(r) {
  214. t.Errorf("IsLetter(U+%04X) = false, want true", r)
  215. }
  216. }
  217. for _, r := range notletterTest {
  218. if IsLetter(r) {
  219. t.Errorf("IsLetter(U+%04X) = true, want false", r)
  220. }
  221. }
  222. }
  223. func TestIsUpper(t *testing.T) {
  224. for _, r := range upperTest {
  225. if !IsUpper(r) {
  226. t.Errorf("IsUpper(U+%04X) = false, want true", r)
  227. }
  228. }
  229. for _, r := range notupperTest {
  230. if IsUpper(r) {
  231. t.Errorf("IsUpper(U+%04X) = true, want false", r)
  232. }
  233. }
  234. for _, r := range notletterTest {
  235. if IsUpper(r) {
  236. t.Errorf("IsUpper(U+%04X) = true, want false", r)
  237. }
  238. }
  239. }
  240. func caseString(c int) string {
  241. switch c {
  242. case UpperCase:
  243. return "UpperCase"
  244. case LowerCase:
  245. return "LowerCase"
  246. case TitleCase:
  247. return "TitleCase"
  248. }
  249. return "ErrorCase"
  250. }
  251. func TestTo(t *testing.T) {
  252. for _, c := range caseTest {
  253. r := To(c.cas, c.in)
  254. if c.out != r {
  255. t.Errorf("To(U+%04X, %s) = U+%04X want U+%04X", c.in, caseString(c.cas), r, c.out)
  256. }
  257. }
  258. }
  259. func TestToUpperCase(t *testing.T) {
  260. for _, c := range caseTest {
  261. if c.cas != UpperCase {
  262. continue
  263. }
  264. r := ToUpper(c.in)
  265. if c.out != r {
  266. t.Errorf("ToUpper(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
  267. }
  268. }
  269. }
  270. func TestToLowerCase(t *testing.T) {
  271. for _, c := range caseTest {
  272. if c.cas != LowerCase {
  273. continue
  274. }
  275. r := ToLower(c.in)
  276. if c.out != r {
  277. t.Errorf("ToLower(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
  278. }
  279. }
  280. }
  281. func TestToTitleCase(t *testing.T) {
  282. for _, c := range caseTest {
  283. if c.cas != TitleCase {
  284. continue
  285. }
  286. r := ToTitle(c.in)
  287. if c.out != r {
  288. t.Errorf("ToTitle(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
  289. }
  290. }
  291. }
  292. func TestIsSpace(t *testing.T) {
  293. for _, c := range spaceTest {
  294. if !IsSpace(c) {
  295. t.Errorf("IsSpace(U+%04X) = false; want true", c)
  296. }
  297. }
  298. for _, c := range letterTest {
  299. if IsSpace(c) {
  300. t.Errorf("IsSpace(U+%04X) = true; want false", c)
  301. }
  302. }
  303. }
  304. // Check that the optimizations for IsLetter etc. agree with the tables.
  305. // We only need to check the Latin-1 range.
  306. func TestLetterOptimizations(t *testing.T) {
  307. for i := rune(0); i <= MaxLatin1; i++ {
  308. if Is(Letter, i) != IsLetter(i) {
  309. t.Errorf("IsLetter(U+%04X) disagrees with Is(Letter)", i)
  310. }
  311. if Is(Upper, i) != IsUpper(i) {
  312. t.Errorf("IsUpper(U+%04X) disagrees with Is(Upper)", i)
  313. }
  314. if Is(Lower, i) != IsLower(i) {
  315. t.Errorf("IsLower(U+%04X) disagrees with Is(Lower)", i)
  316. }
  317. if Is(Title, i) != IsTitle(i) {
  318. t.Errorf("IsTitle(U+%04X) disagrees with Is(Title)", i)
  319. }
  320. if Is(White_Space, i) != IsSpace(i) {
  321. t.Errorf("IsSpace(U+%04X) disagrees with Is(White_Space)", i)
  322. }
  323. if To(UpperCase, i) != ToUpper(i) {
  324. t.Errorf("ToUpper(U+%04X) disagrees with To(Upper)", i)
  325. }
  326. if To(LowerCase, i) != ToLower(i) {
  327. t.Errorf("ToLower(U+%04X) disagrees with To(Lower)", i)
  328. }
  329. if To(TitleCase, i) != ToTitle(i) {
  330. t.Errorf("ToTitle(U+%04X) disagrees with To(Title)", i)
  331. }
  332. }
  333. }
  334. func TestTurkishCase(t *testing.T) {
  335. lower := []rune("abcçdefgğhıijklmnoöprsştuüvyz")
  336. upper := []rune("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ")
  337. for i, l := range lower {
  338. u := upper[i]
  339. if TurkishCase.ToLower(l) != l {
  340. t.Errorf("lower(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToLower(l), l)
  341. }
  342. if TurkishCase.ToUpper(u) != u {
  343. t.Errorf("upper(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToUpper(u), u)
  344. }
  345. if TurkishCase.ToUpper(l) != u {
  346. t.Errorf("upper(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToUpper(l), u)
  347. }
  348. if TurkishCase.ToLower(u) != l {
  349. t.Errorf("lower(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToLower(l), l)
  350. }
  351. if TurkishCase.ToTitle(u) != u {
  352. t.Errorf("title(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToTitle(u), u)
  353. }
  354. if TurkishCase.ToTitle(l) != u {
  355. t.Errorf("title(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToTitle(l), u)
  356. }
  357. }
  358. }
  359. var simpleFoldTests = []string{
  360. // SimpleFold(x) returns the next equivalent rune > x or wraps
  361. // around to smaller values.
  362. // Easy cases.
  363. "Aa",
  364. "δΔ",
  365. // ASCII special cases.
  366. "KkK",
  367. "Ssſ",
  368. // Non-ASCII special cases.
  369. "ρϱΡ",
  370. "ͅΙιι",
  371. // Extra special cases: has lower/upper but no case fold.
  372. "İ",
  373. "ı",
  374. }
  375. func TestSimpleFold(t *testing.T) {
  376. for _, tt := range simpleFoldTests {
  377. cycle := []rune(tt)
  378. r := cycle[len(cycle)-1]
  379. for _, out := range cycle {
  380. if r := SimpleFold(r); r != out {
  381. t.Errorf("SimpleFold(%#U) = %#U, want %#U", r, r, out)
  382. }
  383. r = out
  384. }
  385. }
  386. }
// calibrate is the -calibrate command-line flag; TestCalibrate returns
// immediately unless it is set.
//
// Running 'go test -calibrate' runs the calibration to find a plausible
// cutoff point for linear search of a range list vs. binary search.
// We create a fake table and then time how long it takes to do a
// sequence of searches within that table, for all possible inputs
// relative to the ranges (something before all, in each, between each, after all).
// This assumes that all possible runes are equally likely.
// In practice most runes are ASCII so this is a conservative estimate
// of an effective cutoff value. In practice we could probably set it higher
// than what this function recommends.
var calibrate = flag.Bool("calibrate", false, "compute crossover for linear vs. binary search")
// TestCalibrate does nothing unless the -calibrate flag is set. When it is,
// the test benchmarks linear against binary on fake tables of n ranges,
// prints the per-operation timings for each n it tries, and uses sort.Search
// over n in [0, 64) to print the smallest n for which the comparison below
// favors binary search.
func TestCalibrate(t *testing.T) {
	if !*calibrate {
		return
	}
	// NOTE(review): the warning is printed only when GOARCH is "amd64" —
	// confirm that this is the intended condition.
	if runtime.GOARCH == "amd64" {
		fmt.Printf("warning: running calibration on %s\n", runtime.GOARCH)
	}
	// Find the point where binary search wins by more than 10%.
	// The 10% bias gives linear search an edge when they're close,
	// because on predominantly ASCII inputs linear search is even
	// better than our benchmarks measure.
	n := sort.Search(64, func(n int) bool {
		tab := fakeTable(n)
		// Both benchmarks look up every value from 0 through n*5+20, which
		// covers values before, inside, between and after the fake ranges.
		blinear := func(b *testing.B) {
			tab := tab
			max := n*5 + 20
			for i := 0; i < b.N; i++ {
				for j := 0; j <= max; j++ {
					linear(tab, uint16(j))
				}
			}
		}
		bbinary := func(b *testing.B) {
			tab := tab
			max := n*5 + 20
			for i := 0; i < b.N; i++ {
				for j := 0; j <= max; j++ {
					binary(tab, uint16(j))
				}
			}
		}
		bmlinear := testing.Benchmark(blinear)
		bmbinary := testing.Benchmark(bbinary)
		fmt.Printf("n=%d: linear=%d binary=%d\n", n, bmlinear.NsPerOp(), bmbinary.NsPerOp())
		// True when linear search takes more than 1.1x as long as binary search.
		return bmlinear.NsPerOp()*100 > bmbinary.NsPerOp()*110
	})
	fmt.Printf("calibration: linear cutoff = %d\n", n)
}
  435. func fakeTable(n int) []Range16 {
  436. var r16 []Range16
  437. for i := 0; i < n; i++ {
  438. r16 = append(r16, Range16{uint16(i*5 + 10), uint16(i*5 + 12), 1})
  439. }
  440. return r16
  441. }
  442. func linear(ranges []Range16, r uint16) bool {
  443. for i := range ranges {
  444. range_ := &ranges[i]
  445. if r < range_.Lo {
  446. return false
  447. }
  448. if r <= range_.Hi {
  449. return (r-range_.Lo)%range_.Stride == 0
  450. }
  451. }
  452. return false
  453. }
  454. func binary(ranges []Range16, r uint16) bool {
  455. // binary search over ranges
  456. lo := 0
  457. hi := len(ranges)
  458. for lo < hi {
  459. m := lo + (hi-lo)/2
  460. range_ := &ranges[m]
  461. if range_.Lo <= r && r <= range_.Hi {
  462. return (r-range_.Lo)%range_.Stride == 0
  463. }
  464. if r < range_.Lo {
  465. hi = m
  466. } else {
  467. lo = m + 1
  468. }
  469. }
  470. return false
  471. }
  472. func TestLatinOffset(t *testing.T) {
  473. var maps = []map[string]*RangeTable{
  474. Categories,
  475. FoldCategory,
  476. FoldScript,
  477. Properties,
  478. Scripts,
  479. }
  480. for _, m := range maps {
  481. for name, tab := range m {
  482. i := 0
  483. for i < len(tab.R16) && tab.R16[i].Hi <= MaxLatin1 {
  484. i++
  485. }
  486. if tab.LatinOffset != i {
  487. t.Errorf("%s: LatinOffset=%d, want %d", name, tab.LatinOffset, i)
  488. }
  489. }
  490. }
  491. }