- additional methods for multibyte UTF-8 character detection

2 years ago · 7d9b30fff2
parent 396e140102
commit 7d9b30fff2
3 changed files with 149 additions and 1 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ## Version 1.1.7 - 2022-xx-yy
 ### Fixed
 - handling of German umlauts (and other Unicode characters)
 ## Version 1.1.6 - 2023-01-18
 ### Changed
 - bumped version for latest WotLK Classic patch
--- a/GricheldeConstants.lua
+++ b/GricheldeConstants.lua
@ -287,6 +287,97 @@ local function getNextCharUtf8(word)
    end
 end
 --- Reports whether the two length functions disagree for the given text.
 -- Compares Grichelde.F.length with Grichelde.F.lengthUtf8 (presumably the byte
 -- count versus the UTF-8 character count -- confirm against their definitions).
 -- @param word the text to inspect
 -- @return true when both lengths differ, false otherwise
 local function isUtf8MultiByte(word)
    local plainLength = Grichelde.F.length(word)
    local utf8Length = Grichelde.F.lengthUtf8(word)
    return plainLength ~= utf8Length
 end
 --- Builds a textual escape sequence for a single UTF-8 character.
 -- The result starts with "%z" followed by one "\<decimal byte>" group per byte
 -- of the character, e.g. "%z\195\182" (written out with literal backslashes).
 --
 -- For reference, UTF-8 sequences can be iterated with
 --   string.gmatch(ustring, "([%z\1-\127\194-\244][\128-\191]*)")
 -- which simply skips over most invalid codes.
 --
 -- @param word a string holding exactly one character according to
 --        Grichelde.F.lengthUtf8
 -- @return the escape sequence, or nil when word is not a string, does not hold
 --         exactly one character, starts with a NUL or invalid lead byte, or is
 --         shorter than its lead byte announces
 local function getUtf8Sequence(word)
    if ((word == nil) or (Grichelde.F.type(word) ~= "string") or (Grichelde.F.lengthUtf8(word) ~= 1)) then
        return nil
    end

    local c1 = Grichelde.F.toByte(word, 1)
    if (c1 == nil) then
        return nil
    end

    -- classify the lead byte first, so that no byte is concatenated before it is known to exist
    local byteCount
    if (c1 > 0) and (c1 <= 127) then
        -- UTF8-1
        byteCount = 1
    elseif (c1 >= 194) and (c1 <= 223) then
        -- UTF8-2
        byteCount = 2
    elseif (c1 >= 224) and (c1 <= 239) then
        -- UTF8-3
        byteCount = 3
    elseif (c1 >= 240) and (c1 <= 244) then
        -- UTF8-4
        byteCount = 4
    else
        -- NUL, stray continuation byte or lead byte outside of the UTF-8 range
        return nil
    end

    local sequence = "%z"
    for i = 1, byteCount do
        local byte = Grichelde.F.toByte(word, i)
        if (byte == nil) then
            -- truncated sequence: concatenating nil would raise an error
            return nil
        end
        sequence = sequence .. "\\" .. byte
    end
    return sequence
 end
 --- Splits a single UTF-8 character into a list of textual byte escapes.
 -- The first entry is "%z\<lead byte>", every following entry is
 -- "\<continuation byte>" (decimal values, literal backslashes), in byte order.
 -- Lead bytes outside of the multi-byte ranges yield a table with only the
 -- first entry.
 --
 -- For reference, UTF-8 sequences can be iterated with
 --   string.gmatch(ustring, "([%z\1-\127\194-\244][\128-\191]*)")
 -- which simply skips over most invalid codes.
 --
 -- @param word a string holding exactly one character according to
 --        Grichelde.F.lengthUtf8
 -- @return a table of escapes, or nil when word is not a string, does not hold
 --         exactly one character, or is shorter than its lead byte announces
 local function getUtf8Table(word)
    if ((word == nil) or (Grichelde.F.type(word) ~= "string") or (Grichelde.F.lengthUtf8(word) ~= 1)) then
        return nil
    end

    local c1 = Grichelde.F.toByte(word, 1)
    if (c1 == nil) then
        return nil
    end

    -- number of continuation bytes announced by the lead byte
    local continuationCount = 0
    if (c1 >= 194) and (c1 <= 223) then
        -- UTF8-2
        continuationCount = 1
    elseif (c1 >= 224) and (c1 <= 239) then
        -- UTF8-3
        continuationCount = 2
    elseif (c1 >= 240) and (c1 <= 244) then
        -- UTF8-4
        continuationCount = 3
    end

    local tbl = {}
    Grichelde.F.tInsert(tbl, "%z\\" .. c1)

    -- every continuation byte is added, not only the last one
    for i = 2, continuationCount + 1 do
        local byte = Grichelde.F.toByte(word, i)
        if (byte == nil) then
            -- truncated sequence: concatenating nil would raise an error
            return nil
        end
        Grichelde.F.tInsert(tbl, "\\" .. byte)
    end
    return tbl
 end
 local function isLetter(word)
    local char = Grichelde.F.getNextCharUtf8(word)
    return (char ~= nil) and (Grichelde.F.toUpper(char) ~= Grichelde.F.toLower(char))
@ -418,6 +509,9 @@ Grichelde.F = {
    toUpper         = _G.strupper,
    toLower         = _G.strlower,
    getNextCharUtf8 = getNextCharUtf8,
    isUtf8MultiByte = isUtf8MultiByte,
    getUtf8Sequence = getUtf8Sequence,
    getUtf8Table    = getUtf8Table,
    isLetter        = isLetter,
    isNumber        = isNumber,
    isUpper         = isUpper,
--- a/GricheldeTest.lua
+++ b/GricheldeTest.lua
@ -72,7 +72,7 @@ function Grichelde:RunTests()
            ["OSSO"] = "OCHO",
            ["ooSS"] = "ooCH",
            ["schmeissen"] = "chmeichen",
-            ["Schön"] = "Chön",
+            ["Sch&#246;n"] = "Chön",
        }
    )
    ok = ok + o
@ -776,6 +776,56 @@ function Grichelde:RunTests()
    ok = ok + o
    all = all + a
    o, a = test(
        "umlauts",
        {
            replacement_10 = {
                order = 10,
                searchText = "ä",
                replaceText = "ae",
                exactCase = false,
                consolidate = false,
                matchWhen = 2,
                stopOnMatch = false,
            },
            replacement_11 = {
                order = 11,
                searchText = "ö",
                replaceText = "oe",
                exactCase = false,
                consolidate = false,
                matchWhen = 2,
                stopOnMatch = false,
            },
            replacement_12 = {
                order = 12,
                searchText = "ü",
                replaceText = "ue",
                exactCase = false,
                consolidate = false,
                matchWhen = 2,
                stopOnMatch = false,
            },
            replacement_13 = {
                order = 13,
                searchText = "ß",
                replaceText = "ss",
                exactCase = false,
                consolidate = false,
                matchWhen = 2,
                stopOnMatch = false,
            },
        },
        {
            ["Hallä"] = "Hallae",
            ["Ätsch"] = "Aetsch",
            ["Hällöleü"] = "Haelloeleue",
            ["ÜöÄ"] = "UeoeAe",
        }
    )
    ok = ok + o
    all = all + a
    if (ok == all) then
        self:PrefixedPrint("All %d tests %s", all, cGreen("passed"))
    else