utf-8.go 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
// License: GPLv3 Copyright: 2022, Kovid Goyal, <kovid at kovidgoyal.net>

package utils
  3. // UTF-8 decode taken from: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  4. type UTF8State uint32
  5. var utf8_data []uint8 = []uint8{
  6. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f
  7. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f
  8. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f
  9. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f
  10. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f
  11. 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf
  12. 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df
  13. 0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef
  14. 0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff
  15. 0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
  16. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
  17. 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
  18. 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
  19. 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s7..s8
  20. }
  21. const (
  22. UTF8_ACCEPT UTF8State = 0
  23. UTF8_REJECT UTF8State = 1
  24. )
  25. func DecodeUtf8(state *UTF8State, codep *UTF8State, byte_ byte) UTF8State {
  26. typ := UTF8State(utf8_data[byte_])
  27. b := UTF8State(byte_)
  28. if *state != UTF8_ACCEPT {
  29. *codep = (b & 0x3f) | (*codep << 6)
  30. } else {
  31. *codep = (0xff >> typ) & (b)
  32. }
  33. idx := 256 + *state*16 + typ
  34. *state = UTF8State(utf8_data[idx])
  35. return *state
  36. }
  37. func EncodeUtf8(ch UTF8State, dest []byte) int {
  38. if ch < 0x80 {
  39. dest[0] = byte(ch)
  40. return 1
  41. }
  42. if ch < 0x800 {
  43. dest[0] = byte((ch >> 6) | 0xC0)
  44. dest[1] = byte((ch & 0x3F) | 0x80)
  45. return 2
  46. }
  47. if ch < 0x10000 {
  48. dest[0] = byte((ch >> 12) | 0xE0)
  49. dest[1] = byte(((ch >> 6) & 0x3F) | 0x80)
  50. dest[2] = byte((ch & 0x3F) | 0x80)
  51. return 3
  52. }
  53. if ch < 0x110000 {
  54. dest[0] = byte((ch >> 18) | 0xF0)
  55. dest[1] = byte(((ch >> 12) & 0x3F) | 0x80)
  56. dest[2] = byte(((ch >> 6) & 0x3F) | 0x80)
  57. dest[3] = byte((ch & 0x3F) | 0x80)
  58. return 4
  59. }
  60. return 0
  61. }