Modul:fa-cls-translit
A modult a Modul:fa-cls-translit/doc lapon tudod dokumentálni
-- Authors: Sameerhameedy
-- Shorthands for the Unicode-aware Scribunto string functions used throughout.
local U = mw.ustring.char
local gsub = mw.ustring.gsub
-- Table of public functions; returned at the end of the module.
local export = {}
-- Tanwin marks (U+064B..U+064D). The pre-processing rules rewrite each of
-- them to a short-vowel mark followed by the letter ن.
local fatHataan = U(0x64B) -- اً, tanvin-e nasb (تنوین نصب)
local Dammataan = U(0x64C) -- un
local kasrataan = U(0x64D) -- in
-- Short-vowel marks; `mapping` renders them as "a", "i" and "u" respectively.
local zabar = U(0x64E)
local zer = U(0x650)
local pesh = U(0x64F)
local tashdid = U(0x651) -- also called shadda
-- `mapping` renders jazm as the empty string (no vowel; also called sukun).
local jazm = "ْ"
local he = "ه"
local zwnj = U(0x200C) -- zero-width non-joiner
local highhmz = U(0x654) -- combining hamza above; `mapping` renders it as "-yi"
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
-- The strings below are spliced into "[...]" character classes when the
-- gsub patterns are built, so each one is a set of characters, not a word.
local balticons = "ڃڇڑڗݜݨݩǩ"
local consonants = "بپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنؤهئء" .. balticons
local consonants2 = "ءبپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنوؤهیئyw" .. balticons -- including semivowels
local vowels = "āēīōū"
local semivowel = "یو"
-- NOTE(review): `hes` is not referenced anywhere in the visible source.
local hes = "هح"
local diacritics = "َُِّْٰ"
-- Presumably the three short-vowel marks (zabar, zer, pesh) - TODO confirm.
local ZZP = "َُِ"
local alif_wasla = "ٱ"
-- Whitespace, apostrophe and double quote, as character-class contents.
local space_like = "%s'" .. '"'
-- Complete character class: the above plus ZWNJ.
local space_like_class = "[" .. space_like .. zwnj .. "]"
--- Per-character lookup table used by export.tr for the final conversion
--- pass (`gsub(text, '.', mapping)`): every character that appears as a key
--- is replaced by its value; characters without an entry are left unchanged.
--- The characters ټ ٹ ډ ڈ ے are included only for Mughal Persian and Hazaragi.
local mapping = {
["آ"] = 'ā', ["ب"] = 'b', ["پ"] = 'p', ["ت"] = 't', ["ث"] = 's',
["ج"] = 'j', ["چ"] = 'č', ["ح"] = 'h', ["خ"] = 'x',
["د"] = 'd', ["ذ"] = 'z', ["ر"] = 'r', ["ز"] = 'z', ["ژ"] = 'ž',
["س"] = 's', ["ش"] = 'š', ["ص"] = 's', ["ض"] = 'z',
["ط"] = 't', ["ظ"] = 'z', ["غ"] = 'ğ', ["ف"] = 'f', ["ق"] = 'q',
["ک"] = 'k', ["گ"] = 'g',
["ل"] = 'l', ["م"] = 'm', ["ن"] = 'n',
["و"] = 'ō', ["ی"] = 'ē', ["۔"] = ".",
["ه"] = "h",
["ع"] = "'",
["ء"] = "'",
["ئ"] = "'",
["ؤ"] = "'",
["أ"] = "'",
-- diacritics
[zabar] = "a",
[zer] = "i",
[pesh] = "u",
[jazm] = "", -- also sukun - no vowel
[zwnj] = "-", -- ZWNJ (zero-width non-joiner)
[highhmz] = "-yi",
-- ligatures
["ﻻ"] = "lā",
["ﷲ"] = "allāh",
-- kashida
["ـ"] = "-", -- kashida, no sound
-- alif_wasla
[alif_wasla] = "", -- nothing
-- numerals
["۱"] = "1", ["۲"] = "2", ["۳"] = "3", ["۴"] = "4", ["۵"] = "5",
["۶"] = "6", ["۷"] = "7", ["۸"] = "8", ["۹"] = "9", ["۰"] = "0",
-- punctuation (leave on separate lines)
["؟"] = "?", -- question mark
["،"] = ",", -- comma
["؛"] = ";", -- semicolon
["«"] = '“', -- quotation mark
["»"] = '”', -- quotation mark
["٪"] = "%", -- percent
["؉"] = "‰", -- per mille
["٫"] = ".", -- decimals
["٬"] = ",", -- thousand
-- regional characters (FOR VERY SPECIFIC USE CASES)
["ټ"] = "ṭ", ["ٹ"] = "ṭ", ["ډ"] = "ḍ", ["ڈ"] = "ḍ",
-- balti
-- can't do anything about ژ because it conflicts with Persian
["ڃ"]= "ž",
["ڇ"] = "č̣",
["ڑ"] = "ṛ",
["ڗ"] = "dz",
["ݜ"] = "ṣ",
["ݨ"] = "ng",
["ݩ"] = "ny",
["ھ"] = "h",
["ے"] = "e",
}
-- Character-class contents (already %-escaped where needed). Used to strip
-- punctuation in has_diacritics_subs and to fence punctuation with "#" word
-- boundaries in export.tr.
local punctuation = ":%(%)%[%]*&٫؛؟،ـ«\".\'!»٪؉۔`,/"
-- The digits that `mapping` converts to 1..9 and 0.
local numbers = "۱۲۳۴۵۶۷۸۹۰"
-- Individual letters referenced by name in the gsub rules.
-- NOTE(review): `ain` and `hamza` are not referenced anywhere in the visible source.
local ain = 'ع'
local alif = 'ا'
local malif = "آ"
local hamza = "ء"
local ye = 'ی'
local ye2 = 'ئ'
local vao = "و"
local dagger_alif = U(0x670)
local marbuta = U(0x629)
local te = "ت"
local ye3 = "ے"
local laam = "ل"
-- Complete character class matching one of: a long-vowel Latin letter, a
-- short-vowel mark, jazm, a semivowel letter, or alif madda.
local vowel = "[" .. vowels .. ZZP .. jazm .. semivowel .. malif .."]"
-- Character-class contents used by the Arabic "al-" rules in export.tr, which
-- match a doubled occurrence of one of these letters after laam + jazm.
local sun_letters = "تثدذرزسشصضطظلن"
-- Ordered list of {pattern, replacement} pairs. export.tr applies them one
-- after another with mw.ustring.gsub before it checks whether the text is
-- vocalised, so the order of the entries matters: later rules see the output
-- of earlier ones.
local before_diacritic_checking_subs = {
------------ transformations prior to checking for diacritics --------------
{U(0x06E5), "و"},
{U(0x06E6), "ی"},
{ alif_wasla, ""},
{ 'ہ', he}, -- get rid of balti he (allows balti to transliterate)
{ 'ک' .. highhmz, "ǩ"},
-- move tashdid in front of a vowel mark that precedes it
{"([" .. fatHataan .. ZZP .. dagger_alif .. "])" .. tashdid, tashdid .. "%1"},
{ alif .. fatHataan, zabar .. "ن"},
{ fatHataan .. alif, zabar .. "ن"},
{ jazm .. ye .. dagger_alif , jazm .. ye.. zabar .. alif},
{ zabar .. ye .. dagger_alif , zabar .. alif},
{ ye .. dagger_alif , zabar .. alif}, -- the first letter is U+06CC
{ ye3 , ye},
{ "[أإ]" , ye2},
{ zabar .. dagger_alif, zabar .. alif},
{ dagger_alif, zabar .. alif},
{ fatHataan, zabar .. "ن"}, -- fatḥatan
{ Dammataan, pesh .. "ن"}, -- ḍammatan
{ kasrataan, zer .. "ن"}, -- kasratan
-- ta marbuta: becomes te when followed by a vowel mark or jazm, otherwise he
{ marbuta .. "([" .. ZZP .. "])" .. alif, te .. "%1-"},
{ marbuta .. "([" .. ZZP .. jazm .. "])", te .. "%1"},
{ marbuta , he},
-- allah ligatures and arabic al
{"([" .. consonants2 .. "][".. ZZP .."])(" .. space_like_class .. ")" .. alif .. laam .. "([" .. jazm .. laam .. "])" , "%1%2" .. laam .. "%3" },
{ laam .. laam .. tashdid , laam .. tashdid},
-- use jazm/sukoon to prevent this conversion
{ "(خ)" .. vao .. zabar .. alif , "%1" .. zabar .. alif},
{ "(خ)" .. vao .. zabar , "%1" .. pesh},
{ "(خ)" .. vao .. ye .. "([^" .. ZZP .. jazm .. "])" , "%1" .. ye .. "%2"},
-- izāfa
{ zwnj, "-"},
{ jazm .. alif, "-"},-- vowel killing, invisible ZWNJ
{ zabar .. jazm, "-"},-- vowel killing, invisible ZWNJ
}
-- Ordered list of {pattern, replacement} pairs used by has_diacritics().
-- Each rule deletes (or shrinks) a piece of text that is considered
-- acceptably vocalised; has_diacritics() reports success only if nothing is
-- left once every rule has run. The rules are applied in sequence, so later
-- rules see the output of earlier ones.
local has_diacritics_subs = {
-- remove punctuation and tashdid
{ "[" .. punctuation .. tashdid .. highhmz .. numbers .. fatHataan .."]", ""},
-- a consonant at the end of the text, or before a space-like character or
-- a hyphen, is dropped without requiring a vowel mark
{"[" .. consonants .. "]$", ""},
{"[" .. consonants .. "](" .. space_like_class .. ")", "%1"},
{"[" .. consonants .. "]%-", "-"},
-- these are required for arabic al- to work
{"[" .. consonants2 .. "]" .. "([".. zer .. pesh .."])" .. alif .. laam , laam },
{"[" .. consonants2 .. "]([".. zer .. pesh .."])%-" .. alif .. laam , laam },
-- remove CV pairs
-- consonants paired to alif
{ "[" .. consonants2 .. "]" .. jazm , ""},
{ "[" .. consonants2 .. "]" .. jazm .. malif, ""},
{ "[" .. consonants2 .. "]" .. zabar .. alif, ""},
-- consonants paired to a semivowel
{ "[" .. consonants .. alif .. "][" .. semivowel .. ZZP .. "]([" .. semivowel .. "])([" .. semivowel .. "])", "%1%2"},
{ "[" .. consonants2 .. alif .. "][" .. ZZP .. "][" .. semivowel .. "]", ""},
{ "[" .. consonants2 .. alif .. "][" .. ZZP .. jazm .. semivowel .. "]", ""},
{ "[" .. alif .. consonants2 .. "][" .. ZZP .. "][" .. semivowel .. "]", ""},
{ malif , ""}, -- counts as a CV pair
{ jazm .. alif .. "[" .. ZZP .. "]", ""},
{ "[" .. consonants2 .. alif .."][" .. ZZP .. "]", ""},
{ "[" .. consonants2 .. alif .. semivowel .. "][" .. semivowel .. "]", ""},
-- remove numbers, hamzatu l-waṣl, alif madda and ZWNJ
{ "[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
-- remove whatever whitespace, hyphens, semivowel letters and vowel
-- characters are still left
{ "%s", ""},
{ "%-", ""},
{ "[" .. semivowel .. "]", ""},
{ "(" .. vowel .. ")", ""},
}
--- Reports whether `text` is vocalised well enough to be transliterated.
--
-- Directional marks (LRM/RLM) are stripped first; if any were present the
-- tracking key "fa-translit/lrm or rlm" is recorded. Every rule in
-- has_diacritics_subs is then applied in order, and the text passes only if
-- nothing at all is left afterwards.
--
-- @param text string to examine (the caller's string is not modified)
-- @return true if the text reduces to the empty string, false otherwise
local function has_diacritics(text)
	local residue, marks_found = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if marks_found > 0 then
		require("Module:debug").track("fa-translit/lrm or rlm")
	end

	-- Rules are order-dependent, so walk the list front to back.
	for i = 1, #has_diacritics_subs do
		local rule = has_diacritics_subs[i]
		residue = gsub(residue, rule[1], rule[2])
	end

	return residue == ""
end
--- Transliterates vocalised Persian-script text into Latin script.
--
-- May be called directly with a string, or with a frame-like table whose
-- `args` field supplies, positionally: text, lang, sc, (ignored),
-- force_translit. Empty-string arguments are treated as absent.
--
-- The text is first normalised with before_diacritic_checking_subs. Unless
-- force_translit is set, it must then pass has_diacritics(); otherwise the
-- tracking key "fa-translit/lacking diacritics" is recorded and nil is
-- returned. Word boundaries are temporarily marked with "#" while the
-- context-sensitive rules run, after which every remaining character is
-- converted through `mapping`.
--
-- @param text string to transliterate, or a table with an `args` field
-- @param lang language code; accepted for interface compatibility, not consulted
-- @param sc script code; accepted for interface compatibility, not consulted
-- @return the transliteration (NFC-normalised), or nil when there is no text
--   or when the text is insufficiently vocalised and force_translit is unset
function export.tr(text, lang, sc)
	-- Must be local: the original assigned `omit_i3raab` and `force_translit`
	-- without declaring them, creating globals. A force_translit value set by
	-- one frame-style call then leaked into every later plain-string call in
	-- the same Lua state.
	local force_translit
	if type(text) == "table" then
		local function f(x) return (x ~= "") and x or nil end
		local args = text.args
		-- args[4] used to be read into `omit_i3raab`, which was never used.
		text, lang, sc, force_translit = f(args[1]), f(args[2]), f(args[3]), f(args[5])
	end
	-- Nothing to transliterate (e.g. empty or missing first argument): return
	-- nil, as in the other "cannot transliterate" case below, rather than
	-- letting gsub raise on a nil subject.
	if text == nil then
		return nil
	end
	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = gsub(text, sub[1], sub[2])
	end
	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("fa-translit/lacking diacritics")
		return nil
	end
	--define the "end" of a word
	-- literal "#" is protected first because "#" is used as the boundary marker
	text = gsub(text, "#", "HASHTAG")
	text = gsub(text, "^", "#")
	text = gsub(text, "$", "#")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "%s", "# #")
	-- NOTE(review): the previous line already rewrote every whitespace
	-- character (including "\n") to "# #", so this rule looks unreachable.
	text = gsub(text, "\n" , "#".."\n" .. "#")
	text = gsub(text, "(["..punctuation.."])" , "#".."%1" .. "#")
	text = "##" .. gsub(text, " ", "# #") .. "##"
	text = gsub(text, "%-", "#-#")
	-- hashtags now mark the beginning and end of a word
	--character reformatting and exceptions
	text = gsub(text, highhmz, "#"..highhmz.."#")
	-- a free-standing vao is rendered "u"
	text = gsub(text, "#" .. vao .. "#", "#u#")
	-- prevent izafa from converting until later
	-- Tashdeed: double the consonant (or semivowel, as yy / ww)
	text = gsub(text, '([' .. consonants .. '])' .. tashdid, "%1%1")
	text = gsub(text, '([' .. consonants .. '])' .. tashdid .. '([' .. ZZP .. '])', "%1%1%2")
	text = gsub(text, '([' .. consonants .. '])' .. '([' .. ZZP .. '])' .. tashdid, "%1%1%2")
	text = gsub(text, ye .. '([' .. ZZP .. '])' .. tashdid, "yy%1")
	text = gsub(text, vao .. '([' .. ZZP .. '])' .. tashdid, "ww%1")
	text = gsub(text, ye .. tashdid .. '([' .. ZZP .. '])', "yy%1")
	text = gsub(text, vao .. tashdid .. '([' .. ZZP .. '])', "ww%1")
	-- distinguish initial alif from vowel alif
	text = gsub(text, "(["..consonants2.."])" .. zabar .. alif, "%1ā")
	text = gsub(text, "(["..consonants2.."])" .. alif, "%1ā")
	text = gsub(text, jazm .. malif, "'ā")-- invisible ZWNJ
	text = gsub(text, "(["..consonants2.."])" .. malif, "%1'ā")
	text = gsub(text, alif..ye, "ē")
	text = gsub(text, alif..vao, "ō")
	text = gsub(text, alif..zer..ye, "ī")
	text = gsub(text, alif..pesh..vao, "ū")
	text = gsub(text, tashdid .. alif, tashdid .. "ā")
	-- convert semi vowels
	text = gsub(text, ye .. "ā", "yā")
	text = gsub(text, vao.. "ā", "wā")
	text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "w%1")
	text = gsub(text, ye.. "(["..diacritics..ZZP.."])", "y%1")
	text = gsub(text, ye .. "(["..semivowel.."])(["..semivowel.."])", "ē%1%2")
	text = gsub(text, vao .. "(["..semivowel.."])(["..semivowel.."])", "ō%1%2")
	text = gsub(text, "(["..diacritics..ZZP.."])" .. ye .. "(["..semivowel.."])", "%1y%2")
	text = gsub(text, "(["..diacritics..ZZP.."])" .. vao .. "(["..semivowel.."])", "%1w%2")
	text = gsub(text, "(["..consonants.."])" .. ye .. "(["..semivowel.."])", "%1y%2")
	text = gsub(text, "(["..consonants.."])" .. vao .. "(["..semivowel.."])", "%1w%2")
	-- conversions for vaav/waaw/vao
	text = gsub(text, pesh.. vao, "ū")
	text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "w%1")
	text = gsub(text, "(" .. vowel .. ")" .. vao, "%1w")
	-- conversions for ye
	text = gsub(text, zer.. ye, "ī")
	text = gsub(text, ye .. "(["..diacritics..ZZP.."])", "y%1")
	text = gsub(text, "(" .. vowel ..")" .. ye , "%1y")
	--Alif with short vowel
	text = gsub(text, alif.."(["..ZZP.."])", "%1")
	-- final changes
	-- izafa
	text = gsub(text, "ē" .. zer .. "#", "ē-yi#")
	text = gsub(text, zer .. "y" .. zer .. "#", "ī-yi#")
	text = gsub(text, '([^' .. consonants .. '])' .. "y" .. zer .. "#", "%1-yi#")
	text = gsub(text, '([' .. consonants2 .. '])' .. zer .. "#", "%1-i#")
	-- arabic al, must happen after alif conversions and before he deletion
	text = gsub(text, "([".. ZZP .."])#%-#" .. alif .. laam , "%1#-#" .. laam )
	text = gsub(text, "([".. ZZP .."])" .. alif .. laam .. jazm .. "([".. sun_letters .."])" .. "%2" , "%1%2%2")
	text = gsub(text, "([".. ZZP .."])" .. alif .. laam .. laam , "%1#-#" .. laam .. laam)
	text = gsub(text, "([".. ZZP .."])" .. alif .. laam , "%1#-#" .. laam .. "#-#")
	text = gsub(text, "#([".. ZZP .."]?)" .. laam .. jazm .. "([".. sun_letters .."])" .. "%2" , "#%1%2%2")
	-- he deletion
	text = gsub(text, "(["..ZZP.."])" .. he .. "#" .. zwnj, "%1-")
	text = gsub(text, "(["..ZZP.."])" .. he .. "#", "%1#")
	-- get rid of hashtags (not needed)
	text = gsub(text, "#", "")
	text = gsub(text, "HASHTAG", "#")
	-- lrm/rlm contain no pattern magic characters, so byte-wise gsub is safe
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
	-- convert all characters
	text = mw.ustring.gsub(text, '.', mapping)
	-- alif
	-- Final corrections: collapse runs of a/ā produced by the rules above
	text = mw.ustring.gsub(text, "āa", "ā")
	text = mw.ustring.gsub(text, "aaa", "ā")
	text = mw.ustring.gsub(text, "āā", "ā")
	text = mw.ustring.gsub(text, "aa", "ā")
	text = mw.ustring.toNFC(text)
	return text
end
return export