Module:Sandbox/AbstractWikipedia/TemplateParser
This module is part of user:AGutman-WMF's prototype implementation of Abstract Wikipedia's template language in Scribunto.
This module specifically implements a template-language parser. Its parse
function returns three return values, described below.
A structured representation of the template elements
This is a list of tables, each table representing a single element of the template (either a slot or a textual element). The order of the list corresponds to the order of the elements in the template. Each table has the following fields:
type
:
- For textual elements, this can be punctuation, spacing or text.
- For slots, this can be one of function, lexeme (for L-ids), item (for Q-ids), interpolation (for identifiers which are assumed to interpolate an argument), text (quoted strings within slots), number (an integer number) or undefined (yielding later an error).
text
: for all of the above types except function and interpolation, this field contains the content of the slot or the textual element (unquoted if it is text within a slot).
function
: used only in the function type; this field represents the name of the function.
args
: used only in the function type; this is a (possibly empty) list of the arguments to the function, which are themselves tables of the type discussed here. Given that an argument to a function may be a function itself, this yields a tree representation of the function arguments.
arg
: used only in the interpolation type; this field represents the name of the interpolated argument.
role
: for slots with a dependency label, this contains the grammatical role indicated by the label (i.e. the label without any additional indexes or source indication).
index
: for debugging purposes, this gives the sequential number of the element within the template (only for top-level elements). These indexes are referred to by the relations table (described below).
Relations table
The relations table is a list of tables, each indicating a relation to be applied to the slots of the template. The order is immaterial. Each table in the list has the following three fields:
role
: the name of the relation to be applied.
target
: the index of the target slot (i.e. the slot where the label is given).
source
: the index of the source slot (by default the root slot, unless another source label is given).
Root of the template
The last return value is the index of the slot marked as `root`. If none is given (which is only allowed if dependency labels are not used at all), this will be the index of the first slot (or, in the absence of slots, the index of the last element).
-- Export table for this module; the public parse function is attached to it
-- and the table is returned at the end of the file.
local p = {}
--- Split a template string into an ordered list of raw segments.
-- Each segment is exactly one of:
--   * a slot: everything from a "{" up to and including the next "}";
--   * a run of punctuation characters (Lua class %p);
--   * a run of whitespace characters (Lua class %s);
--   * a run of any other characters (plain text).
-- A new segment is started whenever the kind of character changes, so a
-- text character never gets glued onto a preceding punctuation or spacing
-- run (e.g. "Hello, world" gives "Hello", ",", " ", "world").
-- @param template the template source string
-- @return a sequence of segment strings, in template order
-- Raises an error if the template ends while a slot is still open.
function segmentize ( template )
	local segments = {}
	local segment = ""
	-- Kind of the segment being accumulated outside slots:
	-- 'punctuation', 'spacing', 'text', or nil when nothing is open.
	local segmentKind = nil
	local insideSlot = false

	-- Kept local so repeated calls do not leak a helper into the globals.
	local function pushSegmentIfNotEmpty()
		if segment ~= "" then
			segments[#segments + 1] = segment
			segment = ""
		end
	end

	-- Append char to the open segment, closing it first if its kind differs.
	local function extend(char, kind)
		if segmentKind ~= kind then
			pushSegmentIfNotEmpty()
			segmentKind = kind
		end
		segment = segment .. char
	end

	for char in template:gmatch(".") do
		if insideSlot then
			-- Inside a slot everything is copied verbatim up to the "}"
			segment = segment .. char
			if char == '}' then
				pushSegmentIfNotEmpty()
				insideSlot = false
			end
		elseif char == '{' then
			pushSegmentIfNotEmpty()
			segment = '{'
			segmentKind = nil
			insideSlot = true
		elseif char:match("%p") then
			extend(char, 'punctuation')
		elseif char:match("%s") then
			extend(char, 'spacing')
		else
			extend(char, 'text')
		end
	end

	if insideSlot then
		error("Template ends without closing a slot!")
	end
	pushSegmentIfNotEmpty()
	return segments
end
--- Tell whether a segment contains a slot.
-- @param segment a segment string
-- @return the "{...}" substring (with at least one character between the
--         braces) when present, which is truthy; nil otherwise
function isSlot( segment )
	local slot_text = segment:match("{.+}")
	return slot_text
end
--- Extract the role from a dependency label.
-- The role is the leading run of letters of the label, so any index suffix
-- is dropped (e.g. "subj_1" gives "subj").
-- @param label the label string
-- @return the role string, or nil when the label does not start with a letter
local function getRole ( label )
	local role = label:match("^%a+")
	return role
end
--- Split a slot segment into its label parts and its invocation.
-- A slot has the form "{invocation}" or "{label:invocation}", where the
-- label may carry a source indication as "label<source".
-- @param slot the slot segment, including its enclosing braces
-- @return a table with the fields:
--   invocation - the text after the label separator (or the whole content
--                when the slot has no label);
--   label      - the label without the source part (only for labelled slots);
--   source     - the text after "<" in the label (only when present);
--   role       - getRole(label) (only for labelled slots).
function breakDownSlot ( slot )
	-- strip the enclosing {} characters
	local content = slot:sub(2, -2)
	local colon = content:find(':', 1, true)
	local quote = content:find('"', 1, true)

	-- A colon that comes after an opening quote is part of a quoted string
	-- (e.g. {"a:b"}), not a label separator, so such a slot has no label.
	if not colon or (quote and quote < colon) then
		return { invocation = content }
	end

	local result = { invocation = content:sub(colon + 1) }
	local label = content:sub(1, colon - 1)
	local source_indicator = label:find('<', 1, true)
	if source_indicator then
		result.source = label:sub(source_indicator + 1)
		label = label:sub(1, source_indicator - 1)
	end
	result.role = getRole(label)
	result.label = label
	return result
end
--- Try to match a pattern at the beginning of a string.
-- The pattern is anchored at the start of the string and may be surrounded
-- by whitespace.
-- @param str the string to examine
-- @param pattern a Lua pattern, without anchors
-- @param result a table that is updated only on success:
--   result.match  - the text matched by the pattern (surrounding whitespace
--                   excluded);
--   result.length - the number of characters consumed from the start of
--                   str, surrounding whitespace included.
-- @return true if the pattern matched, false otherwise
local function matches( str, pattern, result)
	local anchored = "^%s*(" .. pattern .. ")%s*"
	-- Declared local so the intermediate values do not leak into globals.
	local _, end_pos, match = str:find(anchored)
	if not match then
		return false
	end
	result.match = match
	result.length = end_pos
	return true
end
--- Parse a single invocation: the content of a slot after its optional
-- label, or one argument of a function call.
-- The alternatives are tried in this order: function call, lexeme (L-id),
-- item (Q-id), interpolation (identifier), quoted text, integer number.
-- @param invocation the string to parse
-- @return a table describing the parsed element (see its `type` field), and
--         the number of characters of `invocation` that were consumed
--         (0 when nothing was recognised and the type is 'undefined')
-- Raises an error when a function argument cannot be parsed, or when
-- something other than "," or ")" follows an argument.
function parseInvocation ( invocation )
	local found = { match = '', length = 0 }
	local node = {}

	if matches(invocation, "%a+%b()", found) then
		-- Function call: a name followed by a balanced, parenthesised,
		-- comma-separated argument list. Each argument is itself parsed
		-- recursively, so it may be another function call.
		local call = found.match
		-- Position (within `call`) of the delimiter preceding the next
		-- argument: first the "(", afterwards each ",".
		local cursor = call:find('%(')
		node.type = 'function'
		node['function'] = call:sub(1, cursor - 1)
		node.args = {}

		local rest = call:sub(cursor + 1, -2)
		while rest ~= '' do
			local arg, consumed = parseInvocation(rest)
			if consumed == 0 then
				error("Unknown element: "..rest)
			end
			table.insert(node.args, arg)

			local delimiter_pos = cursor + consumed + 1
			local delimiter = call:sub(delimiter_pos, delimiter_pos)
			if delimiter == ',' then
				cursor = delimiter_pos
				rest = call:sub(cursor + 1, -2)
			elseif delimiter == ')' then
				rest = ''
			else
				error("Unexpected element in function invocation: "..call:sub(cursor, -2))
			end
		end
		return node, found.length
	end

	if matches(invocation, "L%d+", found) then
		node.type = 'lexeme'
		node.text = found.match
		return node, found.length
	end

	if matches(invocation, "Q%d+", found) then
		node.type = 'item'
		node.text = found.match
		return node, found.length
	end

	if matches(invocation, "[%a_]+", found) then
		node.type = 'interpolation'
		node.arg = found.match
		return node, found.length
	end

	if matches(invocation, '%"[^%"]*%"', found) then
		-- Quoted string: store it without the surrounding quotes
		node.type = 'text'
		node.text = found.match:sub(2,-2)
		return node, found.length
	end

	if matches(invocation, "[+-]?%d+", found) then
		node.type = 'number'
		node.text = found.match
		return node, found.length
	end

	-- Nothing recognised: keep the raw text; the consumed length stays 0
	node.type = 'undefined'
	node.text = invocation
	return node, found.length
end
--- Build the list of relations between labelled slots.
-- One relation is produced for every entry of labelIndexMap; the order of
-- the resulting list is unspecified, as it follows pairs().
-- @param labelIndexMap maps each label to the index of the slot carrying it
-- @param labelSourceMap maps a label to the label of its source slot, for
--        those labels that name an explicit source
-- @param rootIndex index of the root slot, used as the source when a label
--        names no source, or names "root" explicitly
-- @return a list of tables with the fields role, target and source
-- Raises an error when a source label cannot be resolved to a slot index.
function enumerateRelations (labelIndexMap, labelSourceMap, rootIndex)
	local relations = {}
	for label, index in pairs(labelIndexMap) do
		-- Kept local so the intermediate value does not leak into globals.
		local source = rootIndex
		local sourceLabel = labelSourceMap[label]
		if sourceLabel then
			source = labelIndexMap[sourceLabel]
			-- The root slot is not necessarily registered in labelIndexMap,
			-- so an explicit "root" source falls back to rootIndex.
			if not source and sourceLabel == 'root' then
				source = rootIndex
			end
			if not source then
				error("Source label not found: "..sourceLabel)
			end
		end
		relations[#relations + 1] = {
			role = getRole(label), -- remove index part, e.g. subj_1
			target = index,
			source = source,
		}
	end
	return relations
end
--- Parse a template string.
-- @param template the template source string
-- @return elements  - list of element tables, one per segment of the
--                     template, in template order; each carries its
--                     position in the `index` field
-- @return relations - list of relation tables built by enumerateRelations
-- @return rootSlot  - index of the slot labelled "root"; when no labels are
--                     used at all, the index of the first slot, or the
--                     number of elements if the template has no slot
-- Raises an error on unparsable slot content, on a duplicate root label,
-- and when labels are used without any slot being labelled "root".
function p.parse ( template )
	local elements = {}
	local labelIndexMap = {}
	local labelSourceMap = {}
	local rootSlot = nil
	local firstSlot = nil

	-- ipairs guarantees the segments are visited in template order, which
	-- both the element list and firstSlot rely on.
	for index, segment in ipairs(segmentize(template)) do
		if isSlot(segment) then
			firstSlot = firstSlot or index

			local slotComponents = breakDownSlot(segment)
			local invocation = slotComponents.invocation
			local new_element, length = parseInvocation(invocation)
			if length ~= #invocation then
				local extra = invocation:sub(length + 1)
				-- We allow extra white space
				if not extra:match("^%s+$") then
					error("Unexpected element: "..extra)
				end
			end
			new_element.index = index
			new_element.role = slotComponents.role
			table.insert(elements, new_element)

			local label = slotComponents.label
			if label then
				if label == 'root' then
					if rootSlot then
						error("Duplicate root label at position "..index..".")
					end
					rootSlot = index
				else
					labelIndexMap[label] = index
					if slotComponents.source then
						labelSourceMap[label] = slotComponents.source
					end
				end
			end
		elseif segment:match("^%p+$") then
			table.insert(elements, {type='punctuation', text=segment, index=index})
		elseif segment:match("^%s+$") then
			table.insert(elements, {type='spacing', text=segment, index=index})
		else
			table.insert(elements, {type='text', text=segment, index=index})
		end
	end

	if not rootSlot then
		if next(labelIndexMap) ~= nil then
			error("When using relations, you must specify root.")
		end
		-- We allow inferring the root if no relations have been used
		rootSlot = firstSlot or #elements
	end

	local relations = enumerateRelations(labelIndexMap, labelSourceMap, rootSlot)
	-- Write the parsed elements to the Scribunto debug log
	mw.logObject(elements)
	return elements, relations, rootSlot
end
-- Hand the module table (with its parse function) to whoever loads this module.
return p