utf8.lua 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. -- Provides UTF-8 aware string functions implemented in pure lua:
  2. -- * utf8.len(s)
  3. -- * utf8.sub(s, i, j)
  4. --
  5. -- All functions behave as their non UTF-8 aware counterparts with the exception
  6. -- that UTF-8 characters are used instead of bytes for all units.
  7. --
  8. -- Note: all validation has been removed because of the specifics of how awesome uses this module.
  9. --[[
  10. Copyright (c) 2006-2007, Kyle Smith
  11. Modified by Alexander Yakushev, 2010-2013.
  12. All rights reserved.
  13. Redistribution and use in source and binary forms, with or without
  14. modification, are permitted provided that the following conditions are met:
  15. * Redistributions of source code must retain the above copyright notice,
  16. this list of conditions and the following disclaimer.
  17. * Redistributions in binary form must reproduce the above copyright
  18. notice, this list of conditions and the following disclaimer in the
  19. documentation and/or other materials provided with the distribution.
  20. * Neither the name of the author nor the names of its contributors may be
  21. used to endorse or promote products derived from this software without
  22. specific prior written permission.
  23. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  24. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  26. DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
  27. FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  29. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  30. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  31. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  32. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33. --]]
  34. -- ABNF from RFC 3629
  35. --
  36. -- UTF8-octets = *( UTF8-char )
  37. -- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
  38. -- UTF8-1 = %x00-7F
  39. -- UTF8-2 = %xC2-DF UTF8-tail
  40. -- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
  41. -- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
  42. -- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
  43. -- %xF4 %x80-8F 2( UTF8-tail )
  44. -- UTF8-tail = %x80-BF
  45. --
  -- Module table; the UTF-8 helpers below are attached to it.
  local utf8 = {}

  -- Returns the number of bytes used by the UTF-8 character that starts at
  -- byte i in s. Only the lead byte is inspected (ranges per RFC 3629);
  -- the continuation bytes are not read or validated.
  --
  -- s: string to inspect
  -- i: byte index of the character's first byte (defaults to 1)
  --
  -- Returns 1, 2, 3 or 4 for a recognised lead byte, or nil when there is no
  -- byte at index i or the byte cannot start a UTF-8 character
  -- (0x80-0xC1 and 0xF5-0xFF).
  function utf8.charbytes (s, i)
    -- argument defaults
    i = i or 1
    local c = string.byte(s, i)
    if c == nil then
      -- i points outside the string: nothing to measure
      return nil
    elseif c <= 127 then
      -- UTF8-1 (%x00-7F; the NUL byte is a valid one-byte character)
      return 1
    elseif c >= 194 and c <= 223 then
      -- UTF8-2
      return 2
    elseif c >= 224 and c <= 239 then
      -- UTF8-3
      return 3
    elseif c >= 240 and c <= 244 then
      -- UTF8-4
      return 4
    end
    -- continuation byte or a lead byte that RFC 3629 forbids
    return nil
  end
  -- Returns the number of characters in a UTF-8 string.
  --
  -- s: string to measure
  --
  -- The string is walked from the first byte, advancing by the byte width
  -- reported by utf8.charbytes for each character. The result depends only on
  -- s; no global state is consulted.
  function utf8.len (s)
    local pos = 1
    local bytes = string.len(s)
    local len = 0
    while pos <= bytes do
      len = len + 1
      -- utf8.charbytes yields nil for a byte that cannot start a character;
      -- count such a byte as one character so the scan always advances
      -- instead of failing on nil arithmetic
      pos = pos + (utf8.charbytes(s, pos) or 1)
    end
    return len
  end
  -- Functions like string.sub except that i and j count UTF-8 characters
  -- instead of bytes.
  --
  -- s: source string
  -- i: index of the first character to keep; negative values count from the
  --    end of the string. A nil i yields "".
  -- j: index of the last character to keep (defaults to -1, the last
  --    character); negative values count from the end of the string.
  --
  -- Returns the selected substring, or "" when the requested range is empty
  -- or lies entirely outside the string.
  function utf8.sub (s, i, j)
    j = j or -1
    if i == nil then
      return ""
    end
    local bytes = string.len(s)
    -- the character count is only needed to resolve negative indices
    local total
    if i < 0 or j < 0 then
      total = utf8.len(s)
    end
    local startChar = (i >= 0) and i or total + i + 1
    local endChar = (j >= 0) and j or total + j + 1
    -- like string.sub, a start before the first character means "from 1"
    if startChar < 1 then
      startChar = 1
    end
    -- can't have start before end! (also covers an end before character 1)
    if startChar > endChar then
      return ""
    end
    -- byte offsets to pass to string.sub; startByte stays nil until the
    -- start character is actually reached
    local startByte
    local endByte = bytes
    local pos = 1
    local len = 0
    while pos <= bytes do
      len = len + 1
      if len == startChar then
        startByte = pos
      end
      -- treat a byte utf8.charbytes cannot classify as a 1-byte character
      pos = pos + (utf8.charbytes(s, pos) or 1)
      if len == endChar then
        endByte = pos - 1
        break
      end
    end
    -- the start index lies beyond the last character
    if startByte == nil then
      return ""
    end
    return string.sub(s, startByte, endByte)
  end
  -- Replaces UTF-8 characters based on a mapping table.
  --
  -- s: source string
  -- mapping: table indexed by single-character strings; a character with an
  --          entry is replaced by mapping[c], any other character is copied
  --          unchanged
  --
  -- Returns the new string.
  function utf8.replace (s, mapping)
    local pos = 1
    local bytes = string.len(s)
    -- collect the pieces and join once at the end; concatenating inside the
    -- loop would copy the growing result on every character
    local pieces = {}
    local n = 0
    while pos <= bytes do
      -- treat a byte utf8.charbytes cannot classify as a 1-byte character
      local charbytes = utf8.charbytes(s, pos) or 1
      local c = string.sub(s, pos, pos + charbytes - 1)
      n = n + 1
      pieces[n] = mapping[c] or c
      pos = pos + charbytes
    end
    return table.concat(pieces)
  end
  136. return utf8