Module:Sandbox/AbstractWikipedia/TemplateParser

Module documentation


This module is part of user:AGutman-WMF's prototype implementation of Abstract Wikipedia's template language in Scribunto.

This module specifically implements a template-language parser. Its parse function returns three return values, described below.

A structured representation of the template elements

edit

This is a list of tables, each table representing a single element of the template (either a slot or a textual element). The order of the list corresponds to the order of the elements in the template. Each table has the following fields:

  • type :
For textual elements this can be punctuation, spacing or text.
For slots, this can be one of function, lexeme (for L-ids), item (for Q-ids), interpolation (for identifiers which are assumed to interpolate an argument), text (quoted strings within slots), number (an integer number) or undefined (yielding later an error).
  • text: for all above types, except function and interpolation, this field contains the content of the slot or the textual element (unquoted if it is text within a slot).
  • function: used only in the function type, this field represents the name of the function.
  • args: used only in the function type, this is a (possibly-empty) list of the arguments to the function, which are themselves tables of the type discussed here. Given that an argument to a function may be a function itself, this yields a tree representation of the function arguments.
  • arg: used only in the interpolation type, this field represents the name of the interpolated argument.
  • role: for slots with a dependency label, this contains the grammatical role indicated by the label (i.e. the label without any additional indexes or source indication).
  • index: for debugging purposes, this gives the sequential number of the element within the template (only for top-level elements). These indexes are referred to by the relations table (described below).

Relations table

edit

The relations table is a list of tables indicating a relation to be applied on the slots of the template. The order is immaterial. Each table in the list has the following three fields:

  • role: the name of the relation to be applied.
  • target: the index of the target slot (i.e. the slot where the label is given).
  • source: the index of the source slot (by default the root slot, unless another source label is given).

Root of the template

edit

The last return value is the index of the slot marked as `root`. If none is given (which is only allowed if dependency labels are not used at all), this will be the index of the first slot (or in the absence of slots, this will be the index of the last element).


-- Module table; the public functions attached to it are returned at the end
-- of the file.
local p = {}

--- Split a template string into a flat list of segments.
-- A slot (everything from "{" up to and including the next "}") forms one
-- segment. Outside of slots, each run of punctuation, each run of whitespace
-- and each run of any other characters forms a segment of its own.
-- @param template the template string to split
-- @return a list of non-empty segment strings, in template order
-- Raises an error if the template ends inside an unclosed slot.
function segmentize ( template )
	local segments = {}
	local segment = ""
	local insideSlot = false
	-- Declared local so that the helper does not leak into the global
	-- environment every time segmentize is called.
	local function pushSegmentIfNotEmpty()
		if (segment ~= "") then
			table.insert(segments, segment)
			segment = ""
		end
	end
	for char in template:gmatch"." do
		if (not insideSlot) then
			if (char == '{') then
				pushSegmentIfNotEmpty()
				segment = '{'
				insideSlot = true
			elseif (char:match"%p") then
				-- Extend a pending punctuation run, otherwise start a new one
				if (segment:match"^%p*$") then
					segment = segment..char
				else
					pushSegmentIfNotEmpty()
					segment = char
				end
			elseif (char:match"%s") then
				-- Extend a pending whitespace run, otherwise start a new one
				if (segment:match"^%s*$") then
					segment = segment..char
				else
					pushSegmentIfNotEmpty()
					segment = char
				end
			else
				-- Plain text must not be glued onto a pending run of
				-- punctuation or whitespace: close that run first so that it
				-- stays a separate segment.
				if (segment:match"^[%p%s]+$") then
					pushSegmentIfNotEmpty()
				end
				segment = segment..char
			end
		else  -- inside slot
			segment = segment..char
			if (char == '}') then
				pushSegmentIfNotEmpty()
				insideSlot = false
			end
		end
	end
	if (insideSlot) then
		error("Template ends without closing a slot!")
	end
	pushSegmentIfNotEmpty()
	return segments
end

--- Tell whether a segment contains a slot.
-- @param segment a segment string
-- @return the "{...}" substring (with at least one character between the
-- braces) if the segment contains one, nil otherwise
function isSlot( segment )
	local first, last = segment:find("{.+}")
	if first then
		return segment:sub(first, last)
	end
	return nil
end

--- Extract the role from a label by dropping its index part.
-- For example, "subj_1" yields "subj".
-- @param label the label string
-- @return the leading run of letters of the label, or nil if the label does
-- not start with a letter
local function getRole ( label )
	local first, last = label:find("^%a+")
	if not first then
		return nil
	end
	return label:sub(first, last)
end

--- Break a slot segment into its label parts and its invocation.
-- A slot has the form "{label<source:invocation}", where both the
-- "label...:" prefix and its "<source" part are optional.
-- @param slot the slot segment, including the surrounding braces
-- @return a table with the fields:
--   invocation: the text to be parsed as the content of the slot
--   label: the label preceding the colon (only if the slot has a label)
--   role: the result of getRole on the label (only if the slot has a label)
--   source: the text following "<" within the label part (only if given)
function breakDownSlot ( slot )
	local result = {}
	-- strip {} characters
	slot = slot:sub(2, -2)
	local colon = slot:find(':', 1, true)
	-- A colon separates a label from the invocation only when nothing before
	-- it opens a quoted string or an argument list. Otherwise the colon is
	-- part of the invocation itself, as in {"a: b"} or {f("a:b")}.
	if (colon and not slot:sub(1, colon-1):find('["(]')) then
		result.invocation = slot:sub(colon+1)
		local label = slot:sub(1, colon-1)
		local source_indicator = label:find('<', 1, true)
		if (source_indicator) then
			result.source = label:sub(source_indicator+1)
			label = label:sub(1, source_indicator-1)
		end
		result.role = getRole(label)
		result.label = label
	else
		result.invocation = slot
	end
	return result
end

--- Helper function that tries to match a pattern at the start of a string.
-- The pattern is anchored at the beginning of the string, and whitespace is
-- allowed on both sides of it.
-- @param str the string to inspect
-- @param pattern a Lua pattern describing the element to match
-- @param result a table that receives the outcome on success: result.match is
-- the matched text without the surrounding whitespace, result.length is the
-- number of characters consumed including that whitespace. It is left
-- untouched on failure.
-- @return true if the pattern matched, false otherwise
local function matches( str, pattern, result)
	-- Anchor the pattern at the beginning of the string and allow spaces around
	pattern = "^%s*(" .. pattern .. ")%s*"
	-- Locals, so that no globals are written as a side effect of matching
	local _, end_pos, match = str:find(pattern)
	if match then
		result.match = match
		result.length = end_pos
		return true
	end
	return false
end

--- Parse the leading element of an invocation string.
-- The recognised elements are tried in this order: function invocation,
-- lexeme id (L followed by digits), item id (Q followed by digits),
-- interpolation (letters and underscores), quoted text and integer number.
-- @param invocation the text to parse (a slot's content or a function argument)
-- @return a table describing the element (its type field is 'function',
-- 'lexeme', 'item', 'interpolation', 'text', 'number' or 'undefined'), and
-- the number of characters consumed from the start of the invocation
-- (0 when the element is undefined)
-- Raises an error if a function's argument list cannot be parsed.
function parseInvocation ( invocation )
	local found = { match = '', length = 0 }
	local node = {}

	if matches(invocation, "%a+%b()", found) then
		-- Function invocation: name(arg, arg, ...). Every argument is parsed
		-- recursively, so it may itself be a function invocation, an
		-- interpolation etc.
		local call = found.match
		-- cursor is the position of the last delimiter consumed so far:
		-- first the opening parenthesis, afterwards each comma.
		local cursor = call:find('%(')
		node.type = 'function'
		node['function'] = call:sub(1, cursor-1)
		node.args = {}
		local rest = call:sub(cursor+1, -2)
		while rest ~= '' do
			local argument, consumed = parseInvocation(rest)
			if consumed == 0 then
				error("Unknown element: "..rest)
			end
			table.insert(node.args, argument)
			-- The character right after the argument must close the list
			-- or introduce the next argument.
			local delimiter_pos = cursor + consumed + 1
			local delimiter = call:sub(delimiter_pos, delimiter_pos)
			if delimiter == ',' then
				cursor = delimiter_pos
				rest = call:sub(cursor+1, -2)
			elseif delimiter == ')' then
				rest = ''
			else
				error("Unexpected element in function invocation: "..call:sub(cursor, -2))
			end
		end
		return node, found.length
	end

	if matches(invocation, "L%d+", found) then
		node.type = 'lexeme'
		node.text = found.match
		return node, found.length
	end

	if matches(invocation, "Q%d+", found) then
		node.type = 'item'
		node.text = found.match
		return node, found.length
	end

	if matches(invocation, "[%a_]+", found) then
		node.type = 'interpolation'
		node.arg = found.match
		return node, found.length
	end

	if matches(invocation, '%"[^%"]*%"', found) then
		node.type = 'text'
		-- drop the surrounding quotes
		node.text = found.match:sub(2,-2)
		return node, found.length
	end

	if matches(invocation, "[+-]?%d+", found) then
		node.type = 'number'
		node.text = found.match
		return node, found.length
	end

	-- Nothing matched: found.length still holds its initial value 0
	node.type = 'undefined'
	node.text = invocation
	return node, found.length
end

--- Build the list of relations implied by the labels of a template.
-- @param labelIndexMap maps each label to the index of the slot carrying it
-- @param labelSourceMap maps a label to the label of its source slot, for
-- the labels that name an explicit source
-- @param rootIndex index of the root slot, used as the source of every label
-- without an explicit source
-- @return a list of tables with the fields role, target and source; the
-- order of the list is unspecified because it follows pairs()
-- Raises an error if a source label does not correspond to any labelled slot.
function enumerateRelations (labelIndexMap, labelSourceMap, rootIndex)
	local relations = {}
	for label, index in pairs (labelIndexMap) do
		local sourceLabel = labelSourceMap[label]
		-- Local, so that no global is written while resolving sources
		local source = rootIndex
		if (sourceLabel) then
			source = labelIndexMap[sourceLabel]
			if (not source) then
				-- An explicit "root" source designates the root slot
				-- itself, which is the same as giving no source at all.
				if (sourceLabel ~= 'root') then
					error("Source label not found: "..sourceLabel)
				end
				source = rootIndex
			end
		end
		table.insert(relations, {
			role = getRole(label),  -- remove index part, e.g. subj_1
			target = index,
			source = source,
		})
	end
	return relations
end

--- Parse a template string.
-- @param template the template string
-- @return elements: a list with one table per template segment (slot,
-- punctuation, spacing or text), each carrying its position in the index field
-- @return relations: the list built by enumerateRelations from the slot labels
-- @return rootSlot: the index of the slot labelled "root"; if there is none,
-- the index of the first slot, or the number of elements if there is no slot
-- Raises an error on a malformed slot, on a duplicate root label, or when
-- labels are used without any slot being labelled "root".
function p.parse ( template )
	local elements = {}
	local labelIndexMap = {}
	local labelSourceMap = {}
	local rootSlot = nil
	local firstSlot = nil
	
	-- ipairs guarantees that the segments are visited in template order,
	-- which both firstSlot and the order of elements rely on.
	for index, segment in ipairs(segmentize(template)) do
		if (isSlot(segment)) then
			if not firstSlot then
				firstSlot = index
			end
			local slotComponents = breakDownSlot(segment)
			local new_element, length = parseInvocation(slotComponents.invocation)
			if (length ~= #slotComponents.invocation) then
				local extra = slotComponents.invocation:sub(length+1)
				-- We allow extra white space
				if not extra:match("^%s+$") then
					error("Unexpected element: "..extra)
				end
			end
			new_element.index = index
			new_element.role = slotComponents.role
			table.insert(elements, new_element)
			if (slotComponents.label) then
				local label = slotComponents.label
				if (label == 'root') then
					if (rootSlot) then
						error("Duplicate root label at position "..index..".")
					else
						rootSlot = index
					end
				else
					-- Every other label is recorded so that a relation can
					-- be generated for it later on.
					labelIndexMap[label] = index
					if (slotComponents.source) then
						labelSourceMap[label] = slotComponents.source
					end
				end
			end
		elseif segment:match("^%p+$") then
			table.insert(elements, {type='punctuation', text=segment, index=index})
		elseif segment:match("^%s+$") then
			table.insert(elements, {type='spacing', text=segment, index=index})
		else
			table.insert(elements, {type='text', text=segment, index=index})
		end
	end
	
	if (not rootSlot) then
		if next(labelIndexMap) ~= nil then
			error("When using relations, you must specify root.")
		end
		-- We allow inferring the root if no relations have been used
		rootSlot = firstSlot or #elements
	end
	
	local relations = enumerateRelations (labelIndexMap, labelSourceMap, rootSlot)
	-- Dump the parsed elements to the Scribunto debug log
	mw.logObject(elements)
	return elements, relations, rootSlot
end

-- Export the module table.
return p