- additional methods for multibyte UTF-8 character detection

2023-06-03 13:43:26 +02:00 · 2023-06-03 13:43:26 +02:00 · 7d9b30fff2
commit 7d9b30fff2
parent 396e140102
3 changed files with 149 additions and 1 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## Version 1.1.7 - 2023-xx-yy
+### Fixed
+- handling of German umlauts (and other Unicode characters)
+
 ## Version 1.1.6 - 2023-01-18
 ### Changed
 - bumped version for latest WotLK Classic patch
--- a/GricheldeConstants.lua
+++ b/GricheldeConstants.lua
@@ -287,6 +287,97 @@ local function getNextCharUtf8(word)
    end
 end

+local function isUtf8MultiByte(word) -- true when length and lengthUtf8 disagree for word
+    return not (Grichelde.F.length(word) == Grichelde.F.lengthUtf8(word))
+end
+
+--- Renders the bytes of one UTF-8 character as escaped decimal byte codes.
+-- The result is the text "%z" followed by one "\<decimal>" group per byte,
+-- e.g. the two bytes 195 and 164 of "ä" yield the text %z\195\164.
+-- @param word string that should hold exactly one UTF-8 character
+-- @return the escape text, or nil if word is not a string, if
+--         Grichelde.F.lengthUtf8 does not report 1 for it, if it starts with
+--         a byte that is no valid lead byte, or if a required byte is missing
+local function getUtf8Sequence(word)
+    if ((word == nil) or (Grichelde.F.type(word) ~= "string") or (Grichelde.F.lengthUtf8(word) ~= 1)) then
+        return nil
+    end

+    -- To iterate over the UTF-8 characters of a longer string instead, use
+    -- string.gmatch(ustring, "([%z\1-\127\194-\244][\128-\191]*)"), which
+    -- simply skips over most invalid codes.

+    local c1 = Grichelde.F.toByte(word, 1)
+    if (c1 == nil) then
+        return nil
+    end

+    -- total byte count announced by the lead byte; NUL and stray bytes are rejected
+    local size
+    if (c1 > 0) and (c1 <= 127) then
+        size = 1 -- UTF8-1
+    elseif (c1 >= 194) and (c1 <= 223) then
+        size = 2 -- UTF8-2
+    elseif (c1 >= 224) and (c1 <= 239) then
+        size = 3 -- UTF8-3
+    elseif (c1 >= 240) and (c1 <= 244) then
+        size = 4 -- UTF8-4
+    else
+        return nil
+    end

+    local sequence = "%z\\" .. c1
+    for i = 2, size do
+        local c = Grichelde.F.toByte(word, i)
+        if (c == nil) then
+            -- truncated character: bail out instead of concatenating nil
+            return nil
+        end
+        sequence = sequence .. "\\" .. c
+    end
+    return sequence
+end
+
+--- Splits one UTF-8 character into a list of escaped decimal byte codes.
+-- The first entry is "%z" followed by "\<decimal>" for the lead byte; every
+-- continuation byte announced by the lead byte adds one "\<decimal>" entry.
+-- Lead bytes outside the multibyte ranges yield a single-entry table.
+-- @param word string that should hold exactly one UTF-8 character
+-- @return table of escape texts, or nil if word is not a string, is not
+--         reported as one character by lengthUtf8, or lacks announced bytes
+local function getUtf8Table(word)
+    if ((word == nil) or (Grichelde.F.type(word) ~= "string") or (Grichelde.F.lengthUtf8(word) ~= 1)) then
+        return nil
+    end

+    local c1 = Grichelde.F.toByte(word, 1)
+    if (c1 == nil) then
+        return nil
+    end

+    -- number of continuation bytes announced by the lead byte
+    local extra = 0
+    if (c1 >= 194) and (c1 <= 223) then
+        extra = 1 -- UTF8-2
+    elseif (c1 >= 224) and (c1 <= 239) then
+        extra = 2 -- UTF8-3
+    elseif (c1 >= 240) and (c1 <= 244) then
+        extra = 3 -- UTF8-4
+    end

+    local tbl = {}
+    Grichelde.F.tInsert(tbl, "%z\\" .. c1)
+    -- add every continuation byte, not only the last one of the character
+    for i = 2, extra + 1 do
+        local c = Grichelde.F.toByte(word, i)
+        if (c == nil) then
+            return nil
+        end
+        Grichelde.F.tInsert(tbl, "\\" .. c)
+    end
+    return tbl
+end
+
 local function isLetter(word)
    local char = Grichelde.F.getNextCharUtf8(word)
    return (char ~= nil) and (Grichelde.F.toUpper(char) ~= Grichelde.F.toLower(char))
@@ -418,6 +509,9 @@ Grichelde.F = {
    toUpper         = _G.strupper,
    toLower         = _G.strlower,
    getNextCharUtf8 = getNextCharUtf8,
+    isUtf8MultiByte = isUtf8MultiByte,
+    getUtf8Sequence = getUtf8Sequence,
+    getUtf8Table    = getUtf8Table,
    isLetter        = isLetter,
    isNumber        = isNumber,
    isUpper         = isUpper,
--- a/GricheldeTest.lua
+++ b/GricheldeTest.lua
@@ -72,7 +72,7 @@ function Grichelde:RunTests()
            ["OSSO"] = "OCHO",
            ["ooSS"] = "ooCH",
            ["schmeissen"] = "chmeichen",
-            ["Schön"] = "Chön",
+            ["Sch&#246;n"] = "Chön",
        }
    )
    ok = ok + o
@@ -776,6 +776,56 @@ function Grichelde:RunTests()
    ok = ok + o
    all = all + a

+    o, a = test(
+        "umlauts",
+        {
+            replacement_10 = {
+                order = 10,
+                searchText = "ä",
+                replaceText = "ae",
+                exactCase = false,
+                consolidate = false,
+                matchWhen = 2,
+                stopOnMatch = false,
+            },
+            replacement_11 = {
+                order = 11,
+                searchText = "ö",
+                replaceText = "oe",
+                exactCase = false,
+                consolidate = false,
+                matchWhen = 2,
+                stopOnMatch = false,
+            },
+            replacement_12 = {
+                order = 12,
+                searchText = "ü",
+                replaceText = "ue",
+                exactCase = false,
+                consolidate = false,
+                matchWhen = 2,
+                stopOnMatch = false,
+            },
+            replacement_13 = {
+                order = 13,
+                searchText = "ß",
+                replaceText = "ss",
+                exactCase = false,
+                consolidate = false,
+                matchWhen = 2,
+                stopOnMatch = false,
+            },
+        },
+        {
+            ["Hallä"] = "Hallae",
+            ["Ätsch"] = "Aetsch",
+            ["Hällöleü"] = "Haelloeleue",
+            ["ÜöÄ"] = "UeoeAe",
+        }
+    )
+    ok = ok + o
+    all = all + a
+
    if (ok == all) then
        self:PrefixedPrint("All %d tests %s", all, cGreen("passed"))
    else