Module:Sandbox/AbstractWikipedia/Phonotactics/en
This is the English-phonotactics module of the Abstract Wikipedia Scribunto-based NLG prototype. For other phonotactics modules see the main page.
The English implementation scans the lexemes for the indefinite article a (identified by its lemma and part-of-speech). If such a lexeme is found, the following lexeme (ignoring spacing and empty forms) is inspected, and if it starts with a vowel, the article's form is replaced by the form an. Note that in the current implementation, a simple list of Lua patterns (similar to regular expressions) is used to determine whether a form starts with a vowel. Ideally, this information should be stored and fetched from Wikidata for each lexeme.
-- Export table of this module: public functions are attached to it and the
-- table itself is returned at the end of the file.
local p = {}
-- Heuristic test for whether a form begins with a vowel sound.
-- Lexemes carry no phonetic annotation here, so the decision is made purely
-- on spelling: the text is lower-cased with English casing rules and checked
-- against a short list of Lua patterns anchored at the start of the string.
-- Recognised onsets: "a", "i", "o"; "e" followed by anything but "u";
-- "un" followed by anything but "i". Any other spelling yields false.
-- @param text  string to inspect
-- @return      true if one of the onset patterns matches, false otherwise
local function startsWithVowel ( text )
	local lowered = mw.getLanguage("en"):lc(text)
	local vowelOnsets = { "[aio]", "e[^u]", "un[^i]" } -- could be more exact
	local matched = false
	local i = 1
	-- Stop at the first pattern that matches the beginning of the text.
	while not matched and i <= #vowelOnsets do
		matched = lowered:find("^"..vowelOnsets[i]) ~= nil
		i = i + 1
	end
	return matched
end
-- Returns the first non-empty, non-spacing lexeme after index, or nil when
-- no such lexeme exists.
-- A lexeme is skipped when its `pos` field is 'spacing' or when its string
-- rendering is the empty string.
-- NOTE(review): presumably lexemes define a __tostring metamethod yielding
-- their current form; confirm against the lexeme implementation.
-- @param lexemes  sequence of lexeme objects
-- @param index    position after which the search starts
-- @return         the matching lexeme, or nil
local function followingLexeme ( lexemes, index )
	for i = index + 1, #lexemes do
		local candidate = lexemes[i]
		-- tostring() must wrap the lexeme itself: stringifying the result of
		-- the comparison yields "true"/"false", which is always truthy and
		-- would let empty forms through.
		if candidate.pos ~= 'spacing' and tostring(candidate) ~= '' then
			return candidate
		end
	end
	return nil
end
-- Applies English phonotactic adjustments to a sequence of lexemes in place.
-- There is a single phonotactic rule in English: a->an. Every lexeme whose
-- `pos` is 'article' and whose `lemma` is 'a' is inspected; if the next
-- non-spacing, non-empty lexeme starts with a vowel (per startsWithVowel),
-- the article's form is replaced by 'an' and the change is logged.
-- @param lexemes  sequence of lexeme objects; modified in place
-- @return         nothing
function p.applyPhonotactics ( lexemes )
	for index, lexeme in ipairs(lexemes) do
		if lexeme.pos == 'article' and lexeme.lemma == 'a' then
			local nextLexeme = followingLexeme(lexemes, index)
			-- Guard against an article with nothing after it: without this,
			-- the literal string "nil" would be fed to the vowel heuristic.
			if nextLexeme ~= nil and startsWithVowel(tostring(nextLexeme)) then
				-- Clear all forms and add 'an'
				-- NOTE(review): replaceByForm and log are invoked with '.'
				-- rather than ':' -- presumably they are closures bound to
				-- the lexeme; confirm against the lexeme implementation.
				lexeme.replaceByForm('an')
				mw.log("Phonotactics module modified indefinite article")
				lexeme.log()
			end
		end
	end
end
-- Hand the export table back to whoever loads this module.
return p