- additional methods for multibyte UTF-8 character detection

umlaute
Lothar Buchholz 2 years ago
parent 396e140102
commit 7d9b30fff2

@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## Version 1.1.7 - 2022-xx-yy
### Fixed
- handling of German umlauts (and other Unicode characters)
## Version 1.1.6 - 2023-01-18 ## Version 1.1.6 - 2023-01-18
### Changed ### Changed
- bumped version for latest WotLK Classic patch - bumped version for latest WotLK Classic patch

@ -287,6 +287,97 @@ local function getNextCharUtf8(word)
end end
end end
--- Tells whether a string contains at least one multi-byte UTF-8 character.
-- Compares the result of Grichelde.F.length with the result of
-- Grichelde.F.lengthUtf8; the two differ as soon as those counts disagree.
-- @param word the string to inspect
-- @return true if both lengths differ, false otherwise
local function isUtf8MultiByte(word)
    local byteCount = Grichelde.F.length(word)
    local charCount = Grichelde.F.lengthUtf8(word)
    return byteCount ~= charCount
end
--- Builds a textual byte sequence for a single UTF-8 character.
-- The result starts with the literal text "%z", followed by one
-- "\<decimal byte value>" group (a literal backslash plus digits) per byte
-- of the character, e.g. "%z\195\164" for "ä".
-- NOTE(review): the purpose of the leading "%z" is not visible from this
-- function; it is kept unchanged for the callers.
--
-- Hint for iterating over UTF-8 sequences of a longer string (this simply
-- skips over most invalid codes):
--     for uchar in string.gmatch(ustring, "([%z\1-\127\194-\244][\128-\191]*)") do ... end
--
-- @param word a string that must consist of exactly one UTF-8 character
-- @return the sequence string, or nil if word is nil, not a string, not
--         exactly one character long, starts with a byte that is no valid
--         UTF-8 lead byte (0, 128-193, 245-255), or is truncated
local function getUtf8Sequence(word)
    if ((word == nil) or (Grichelde.F.type(word) ~= "string") or (Grichelde.F.lengthUtf8(word) ~= 1)) then
        return nil
    end

    local c1 = Grichelde.F.toByte(word, 1)
    if (c1 == nil) then
        return nil
    end

    -- Classify the lead byte first, so that no continuation byte is read
    -- (and concatenated) before it is known how many bytes are expected.
    local expected
    if (c1 > 0) and (c1 <= 127) then
        -- UTF8-1
        expected = 1
    elseif (c1 >= 194) and (c1 <= 223) then
        -- UTF8-2
        expected = 2
    elseif (c1 >= 224) and (c1 <= 239) then
        -- UTF8-3
        expected = 3
    elseif (c1 >= 240) and (c1 <= 244) then
        -- UTF8-4
        expected = 4
    else
        -- NUL, stray continuation byte or invalid lead byte
        return nil
    end

    local sequence = "%z\\" .. c1
    for i = 2, expected do
        local c = Grichelde.F.toByte(word, i)
        if (c == nil) then
            -- truncated sequence: concatenating nil would raise an error
            return nil
        end
        sequence = sequence .. "\\" .. c
    end
    return sequence
end
--- Splits a single UTF-8 character into a table with one entry per byte.
-- The first entry is the literal text "%z" followed by "\<decimal value>"
-- of the lead byte; every further entry is "\<decimal value>" of one
-- continuation byte, in order. For "ä" this gives { "%z\195", "\164" }.
-- NOTE(review): the purpose of the leading "%z" is not visible from this
-- function; it is kept unchanged for the callers.
--
-- Hint for iterating over UTF-8 sequences of a longer string (this simply
-- skips over most invalid codes):
--     for uchar in string.gmatch(ustring, "([%z\1-\127\194-\244][\128-\191]*)") do ... end
--
-- @param word a string that must consist of exactly one UTF-8 character
-- @return the table of byte entries; a table with only the first entry if
--         the lead byte announces no continuation bytes; nil if word is
--         nil, not a string, not exactly one character long, or truncated
local function getUtf8Table(word)
    if ((word == nil) or (Grichelde.F.type(word) ~= "string") or (Grichelde.F.lengthUtf8(word) ~= 1)) then
        return nil
    end

    local c1 = Grichelde.F.toByte(word, 1)
    if (c1 == nil) then
        return nil
    end

    -- Number of bytes announced by the lead byte. Any lead byte outside the
    -- multi-byte ranges yields a table with the first entry only.
    local expected = 1
    if (c1 >= 194) and (c1 <= 223) then
        -- UTF8-2
        expected = 2
    elseif (c1 >= 224) and (c1 <= 239) then
        -- UTF8-3
        expected = 3
    elseif (c1 >= 240) and (c1 <= 244) then
        -- UTF8-4
        expected = 4
    end

    local tbl = {}
    Grichelde.F.tInsert(tbl, "%z\\" .. c1)
    -- Insert ALL continuation bytes, not only the last one, so that 3- and
    -- 4-byte characters keep their middle bytes.
    for i = 2, expected do
        local c = Grichelde.F.toByte(word, i)
        if (c == nil) then
            -- truncated sequence: concatenating nil would raise an error
            return nil
        end
        Grichelde.F.tInsert(tbl, "\\" .. c)
    end
    return tbl
end
local function isLetter(word) local function isLetter(word)
local char = Grichelde.F.getNextCharUtf8(word) local char = Grichelde.F.getNextCharUtf8(word)
return (char ~= nil) and (Grichelde.F.toUpper(char) ~= Grichelde.F.toLower(char)) return (char ~= nil) and (Grichelde.F.toUpper(char) ~= Grichelde.F.toLower(char))
@ -418,6 +509,9 @@ Grichelde.F = {
toUpper = _G.strupper, toUpper = _G.strupper,
toLower = _G.strlower, toLower = _G.strlower,
getNextCharUtf8 = getNextCharUtf8, getNextCharUtf8 = getNextCharUtf8,
isUtf8MultiByte = isUtf8MultiByte,
getUtf8Sequence = getUtf8Sequence,
getUtf8Table = getUtf8Table,
isLetter = isLetter, isLetter = isLetter,
isNumber = isNumber, isNumber = isNumber,
isUpper = isUpper, isUpper = isUpper,

@ -72,7 +72,7 @@ function Grichelde:RunTests()
["OSSO"] = "OCHO", ["OSSO"] = "OCHO",
["ooSS"] = "ooCH", ["ooSS"] = "ooCH",
["schmeissen"] = "chmeichen", ["schmeissen"] = "chmeichen",
["Schön"] = "Chön", ["Sch&#246;n"] = "Chön",
} }
) )
ok = ok + o ok = ok + o
@ -776,6 +776,56 @@ function Grichelde:RunTests()
ok = ok + o ok = ok + o
all = all + a all = all + a
-- Test group "umlauts": four replacement rules that transliterate the
-- German special characters ä, ö, ü and ß to their ASCII digraphs.
-- The second table argument maps each input text to its expected output.
o, a = test(
"umlauts",
{
-- rule: "ä" -> "ae"
replacement_10 = {
order = 10,
searchText = "ä",
replaceText = "ae",
exactCase = false,
consolidate = false,
matchWhen = 2,
stopOnMatch = false,
},
-- rule: "ö" -> "oe"
replacement_11 = {
order = 11,
searchText = "ö",
replaceText = "oe",
exactCase = false,
consolidate = false,
matchWhen = 2,
stopOnMatch = false,
},
-- rule: "ü" -> "ue"
replacement_12 = {
order = 12,
searchText = "ü",
replaceText = "ue",
exactCase = false,
consolidate = false,
matchWhen = 2,
stopOnMatch = false,
},
-- rule: "ß" -> "ss"
-- NOTE(review): none of the inputs below contains "ß", so this rule is
-- not exercised by this test group.
replacement_13 = {
order = 13,
searchText = "ß",
replaceText = "ss",
exactCase = false,
consolidate = false,
matchWhen = 2,
stopOnMatch = false,
},
},
{
-- Upper-case umlauts are expected to give a capitalized digraph
-- ("Ä" -> "Ae", "Ü" -> "Ue"), lower-case ones a lower-case digraph.
["Hallä"] = "Hallae",
["Ätsch"] = "Aetsch",
["Hällöleü"] = "Haelloeleue",
["ÜöÄ"] = "UeoeAe",
}
)
-- add the two values returned by test() to the running totals
ok = ok + o
all = all + a
if (ok == all) then if (ok == all) then
self:PrefixedPrint("All %d tests %s", all, cGreen("passed")) self:PrefixedPrint("All %d tests %s", all, cGreen("passed"))
else else

Loading…
Cancel
Save