From 7d9b30fff20b89201a88e2f5fd2b4bafcd0da94f Mon Sep 17 00:00:00 2001
From: Lothar Buchholz
Date: Sat, 3 Jun 2023 13:43:26 +0200
Subject: [PATCH] - additional methods for multibyte UTF8 character detection

---
 CHANGELOG.md           |  4 ++
 GricheldeConstants.lua | 91 ++++++++++++++++++++++++++++++++++++++++++
 GricheldeTest.lua      | 52 +++++++++++++++++++++++-
 3 files changed, 146 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index acfceb8..ab7969a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Version 1.1.7 - 2023-xx-yy
+### Fixed
+- handling of German umlauts (and other Unicode characters)
+
 ## Version 1.1.6 - 2023-01-18
 ### Changed
 - bumped version for latest WotLK Classic patch
diff --git a/GricheldeConstants.lua b/GricheldeConstants.lua
index e1074d5..2d58159 100644
--- a/GricheldeConstants.lua
+++ b/GricheldeConstants.lua
@@ -287,6 +287,94 @@ local function getNextCharUtf8(word)
     end
 end
 
+-- Tells whether word contains at least one multibyte UTF-8 character,
+-- i.e. whether its byte length differs from its UTF-8 character count.
+local function isUtf8MultiByte(word)
+    return Grichelde.F.length(word) ~= Grichelde.F.lengthUtf8(word)
+end
+
+-- Returns the number of bytes (1 to 4) of the UTF-8 sequence introduced by
+-- leadByte, or nil if leadByte is nil or cannot start a sequence.
+local function getUtf8SequenceLength(leadByte)
+    if (leadByte == nil) then
+        return nil
+    elseif (leadByte > 0) and (leadByte <= 127) then
+        -- UTF8-1
+        return 1
+    elseif (leadByte >= 194) and (leadByte <= 223) then
+        -- UTF8-2
+        return 2
+    elseif (leadByte >= 224) and (leadByte <= 239) then
+        -- UTF8-3
+        return 3
+    elseif (leadByte >= 240) and (leadByte <= 244) then
+        -- UTF8-4
+        return 4
+    end
+
+    return nil
+end
+
+-- Builds the escaped byte sequence of the single UTF-8 character in word:
+-- "%z" followed by a backslash and the decimal value of every byte,
+-- e.g. "%z\195\164" for "ä".
+-- Returns nil if word is not a string, does not hold exactly one character,
+-- starts with an invalid lead byte or is shorter than its lead byte announces.
+local function getUtf8Sequence(word)
+    if ((word == nil) or (Grichelde.F.type(word) ~= "string") or (Grichelde.F.lengthUtf8(word) ~= 1)) then
+        return nil
+    end
+
+    local byteCount = getUtf8SequenceLength(Grichelde.F.toByte(word, 1))
+    if (byteCount == nil) then
+        return nil
+    end
+
+    local sequence = "%z"
+    for i = 1, byteCount do
+        local byte = Grichelde.F.toByte(word, i)
+        if (byte == nil) then
+            -- truncated sequence
+            return nil
+        end
+        sequence = sequence .. "\\" .. byte
+    end
+
+    return sequence
+end
+
+-- Same as getUtf8Sequence, but returns the escaped bytes as a table: the first
+-- entry is "%z\<byte 1>", every further byte of the character follows as its
+-- own "\<byte n>" entry. An invalid lead byte yields a one-entry table.
+-- Returns nil if word is not a string, does not hold exactly one character
+-- or is shorter than its lead byte announces.
+local function getUtf8Table(word)
+    if ((word == nil) or (Grichelde.F.type(word) ~= "string") or (Grichelde.F.lengthUtf8(word) ~= 1)) then
+        return nil
+    end
+
+    local c1 = Grichelde.F.toByte(word, 1)
+    if (c1 == nil) then
+        return nil
+    end
+
+    local tbl = {}
+    Grichelde.F.tInsert(tbl, "%z\\" .. c1)
+
+    -- an unknown lead byte contributes only itself
+    local byteCount = getUtf8SequenceLength(c1) or 1
+    for i = 2, byteCount do
+        local byte = Grichelde.F.toByte(word, i)
+        if (byte == nil) then
+            -- truncated sequence
+            return nil
+        end
+        Grichelde.F.tInsert(tbl, "\\" .. byte)
+    end
+
+    return tbl
+end
+
 local function isLetter(word)
     local char = Grichelde.F.getNextCharUtf8(word)
     return (char ~= nil) and (Grichelde.F.toUpper(char) ~= Grichelde.F.toLower(char))
@@ -418,6 +506,9 @@ Grichelde.F = {
     toUpper = _G.strupper,
     toLower = _G.strlower,
     getNextCharUtf8 = getNextCharUtf8,
+    isUtf8MultiByte = isUtf8MultiByte,
+    getUtf8Sequence = getUtf8Sequence,
+    getUtf8Table = getUtf8Table,
     isLetter = isLetter,
     isNumber = isNumber,
     isUpper = isUpper,
diff --git a/GricheldeTest.lua b/GricheldeTest.lua
index 46f8f18..ce9db43 100644
--- a/GricheldeTest.lua
+++ b/GricheldeTest.lua
@@ -72,7 +72,7 @@ function Grichelde:RunTests()
             ["OSSO"] = "OCHO",
             ["ooSS"] = "ooCH",
             ["schmeissen"] = "chmeichen",
-            ["Schön"] = "Chön",
+            ["Schön"] = "Chön",
         }
     )
     ok = ok + o
@@ -776,6 +776,56 @@ function Grichelde:RunTests()
     ok = ok + o
     all = all + a
 
+    o, a = test(
+        "umlauts",
+        {
+            replacement_10 = {
+                order = 10,
+                searchText = "ä",
+                replaceText = "ae",
+                exactCase = false,
+                consolidate = false,
+                matchWhen = 2,
+                stopOnMatch = false,
+            },
+            replacement_11 = {
+                order = 11,
+                searchText = "ö",
+                replaceText = "oe",
+                exactCase = false,
+                consolidate = false,
+                matchWhen = 2,
+                stopOnMatch = false,
+            },
+            replacement_12 = {
+                order = 12,
+                searchText = "ü",
+                replaceText = "ue",
+                exactCase = false,
+                consolidate = false,
+                matchWhen = 2,
+                stopOnMatch = false,
+            },
+            replacement_13 = {
+                order = 13,
+                searchText = "ß",
+                replaceText = "ss",
+                exactCase = false,
+                consolidate = false,
+                matchWhen = 2,
+                stopOnMatch = false,
+            },
+        },
+        {
+            ["Hallä"] = "Hallae",
+            ["Ätsch"] = "Aetsch",
+            ["Hällöleü"] = "Haelloeleue",
+            ["ÜöÄ"] = "UeoeAe",
+        }
+    )
+    ok = ok + o
+    all = all + a
+
     if (ok == all) then
         self:PrefixedPrint("All %d tests %s", all, cGreen("passed"))
     else