123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165 |
-- Provides UTF-8 aware string functions implemented in pure Lua:
-- * utf8.charbytes(s, i)
-- * utf8.len(s)
-- * utf8.sub(s, i, j)
-- * utf8.replace(s, mapping)
--
-- utf8.len and utf8.sub behave as their non UTF-8 aware counterparts
-- (string.len and string.sub) with the exception that UTF-8 characters are
-- used instead of bytes for all units.
--
-- Note: all validations have been removed due to awesome usage specifics.
- --[[
- Copyright (c) 2006-2007, Kyle Smith
- Modified by Alexander Yakushev, 2010-2013.
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the author nor the names of its contributors may be
- used to endorse or promote products derived from this software without
- specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- --]]
- -- ABNF from RFC 3629
- --
- -- UTF8-octets = *( UTF8-char )
- -- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
- -- UTF8-1 = %x00-7F
- -- UTF8-2 = %xC2-DF UTF8-tail
- -- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
- -- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
- -- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
- -- %xF4 %x80-8F 2( UTF8-tail )
- -- UTF8-tail = %x80-BF
- --
- -- returns the number of bytes used by the UTF-8 character at byte i in s
- -- also doubles as a UTF-8 character validator
- local utf8 = {}
- function utf8.charbytes (s, i)
- -- argument defaults
- i = i or 1
- local c = string.byte(s, i)
-
- -- determine bytes needed for character, based on RFC 3629
- if c > 0 and c <= 127 then
- -- UTF8-1
- return 1
- elseif c >= 194 and c <= 223 then
- -- UTF8-2
- local c2 = string.byte(s, i + 1)
- return 2
- elseif c >= 224 and c <= 239 then
- -- UTF8-3
- local c2 = s:byte(i + 1)
- local c3 = s:byte(i + 2)
- return 3
- elseif c >= 240 and c <= 244 then
- -- UTF8-4
- local c2 = s:byte(i + 1)
- local c3 = s:byte(i + 2)
- local c4 = s:byte(i + 3)
- return 4
- end
- end
- -- returns the number of characters in a UTF-8 string
- function utf8.len (s)
- local pos = 1
- local bytes = string.len(s)
- local len = 0
-
- while pos <= bytes and len ~= chars do
- local c = string.byte(s,pos)
- len = len + 1
-
- pos = pos + utf8.charbytes(s, pos)
- end
-
- if chars ~= nil then
- return pos - 1
- end
-
- return len
- end
- -- functions identically to string.sub except that i and j are UTF-8 characters
- -- instead of bytes
- function utf8.sub (s, i, j)
- j = j or -1
- if i == nil then
- return ""
- end
-
- local pos = 1
- local bytes = string.len(s)
- local len = 0
- -- only set l if i or j is negative
- local l = (i >= 0 and j >= 0) or utf8.len(s)
- local startChar = (i >= 0) and i or l + i + 1
- local endChar = (j >= 0) and j or l + j + 1
- -- can't have start before end!
- if startChar > endChar then
- return ""
- end
-
- -- byte offsets to pass to string.sub
- local startByte, endByte = 1, bytes
-
- while pos <= bytes do
- len = len + 1
-
- if len == startChar then
- startByte = pos
- end
-
- pos = pos + utf8.charbytes(s, pos)
-
- if len == endChar then
- endByte = pos - 1
- break
- end
- end
-
- return string.sub(s, startByte, endByte)
- end
- -- replace UTF-8 characters based on a mapping table
- function utf8.replace (s, mapping)
- local pos = 1
- local bytes = string.len(s)
- local charbytes
- local newstr = ""
- while pos <= bytes do
- charbytes = utf8.charbytes(s, pos)
- local c = string.sub(s, pos, pos + charbytes - 1)
- newstr = newstr .. (mapping[c] or c)
- pos = pos + charbytes
- end
- return newstr
- end
- return utf8
|