Module:Sandbox/AbstractWikipedia/Phonotactics/he
This is the Hebrew-phonotactics module of the Abstract Wikipedia Scribunto-based NLG prototype. For other phonotactics modules see the main page.
The Hebrew implementation takes care of certain orthographic and phonotactic alternations happening after Hebrew proclitics. It scans the list of lexemes and if a proclitic, identified by its lemma, is found, the following lexemes may be altered in the following ways:
- Spaces following proclitics are removed.
- The definite article (identified by its part-of-speech) is removed following certain proclitics.
- If the proclitic is followed by a number written in digits, a hyphen is inserted between the proclitic and the number, in accordance with Hebrew orthographic conventions.
- If a proclitic is followed by a word starting with the letter Vav, that letter is doubled, in accordance with the rules for writing unvocalized Hebrew text.
-- Module table: functions attached to it are what this module returns to callers.
local p = {}
-- Set of the single-letter Hebrew proclitics recognised by this module.
local PROCLITICS = {
	['מ'] = true,
	['ש'] = true,
	['ה'] = true,
	['כ'] = true,
	['ל'] = true,
	['ב'] = true,
	['ו'] = true,
}

--- Tells whether a lexeme's string form is one of the Hebrew proclitics.
-- The argument is converted with tostring(), so any value is accepted;
-- nil becomes the string "nil" and therefore yields false.
-- @param lexeme value whose tostring() result is compared to the proclitics
-- @return true if the string form is exactly one proclitic letter, else false
local function isProclitic ( lexeme )
	-- Keep the string local: a bare assignment would create a global 'text'.
	local text = tostring(lexeme)
	return PROCLITICS[text] == true
end
--- Applies Hebrew orthographic adjustments to lexemes that follow a proclitic.
-- Walks the list in order and, whenever the previously remembered lexeme is a
-- proclitic (per isProclitic), rewrites the current lexeme in place through
-- its replaceByForm function:
--   * a 'spacing' lexeme is replaced by the empty string;
--   * an 'article' lexeme is replaced by the empty string after ב, ל or כ;
--   * a form starting with a digit gets a leading hyphen;
--   * a form starting with a single Vav gets that Vav doubled, unless the
--     preceding lexeme's lemma is itself Vav.
-- @param lexemes sequence of lexeme objects; each is read through
--        tostring(), .pos and .lemma and modified through .replaceByForm
-- @return nothing; the lexemes are modified in place
function p.applyPhonotactics ( lexemes )
	mw.log("Hebrew phonotactics")
	-- Most recent lexeme that is neither spacing nor empty.
	local last_lexeme = nil
	for _, lexeme in ipairs(lexemes) do
		if isProclitic(last_lexeme) then
			local text = tostring(lexeme)
			-- String form of the preceding proclitic; this is the same value
			-- isProclitic just matched, so it is one of the proclitic letters.
			local prep = tostring(last_lexeme)
			-- tostring() guards the concatenation against a missing pos.
			mw.log("After proclitic: '"..text.."' ("..tostring(lexeme.pos)..")")
			-- NOTE(review): replaceByForm is called with '.', not ':';
			-- presumably it is bound to its lexeme by closure - confirm.
			if lexeme.pos == 'spacing' then
				-- Remove spacing after proclitics
				lexeme.replaceByForm('')
			elseif lexeme.pos == 'article' then
				-- Omit article after certain prepositions
				if prep == 'ב' or prep == 'ל' or prep == 'כ' then
					lexeme.replaceByForm('')
				end
			elseif text:match("^%d") then
				-- Add hyphen between proclitic and numbers
				lexeme.replaceByForm('-'..text)
			elseif last_lexeme.lemma ~= 'ו' and text:match("^ו") and not text:match("^וו") then
				-- Double an initial Vav that is not already doubled
				lexeme.replaceByForm('ו'..text)
			end
		end
		-- Spacing and empty forms do not become the "previous" lexeme, so a
		-- proclitic stays remembered across them (presumably including forms
		-- just blanked above - confirm replaceByForm affects tostring()).
		if lexeme.pos ~= 'spacing' and tostring(lexeme) ~= '' then
			last_lexeme = lexeme
		end
	end
end
-- Hand the module table (with applyPhonotactics attached) back to the caller.
return p