Modul:FParser
Belgekirina modulê[biafirîne]
local lexer = {}
local parser = {}
--[[ These parser functions are generic functions to build a parser.
It works with "Lexing" and "Parsing" functions.
The parser is initialized by the "parse" function, who generates a "state" object that must be a parameter of each parsing functions,
and eventually returns the main node of the AST if everything goes well
* Lexing functions have one parameter, the state of the parser, and returns a modified state
if they could find the terminals they are supposed to recognize and or nothing if the parse fails.
* Parsing functions always have a state as unique parameter.
They can be divided into
* Generic one, written in this module that corresponds that helps to build a parser
but don't generate nodes of the AST themselves, like "alternative", "chain", "star" or "plus"
* "chain" corresponds to the concatenation operation in a grammar. for example a function that parses the EBNF rule
twelve = "1", "2" ;
will be generated by chain{lex_char("1"), lex_char("2")}
* "alternative" corresponds to the alternative operation in a grammar, for example a function that parses the EBNF rule
digit_excluding_zero = "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
will be written alternative{lex_char("1"), lex_char("2"), lex_char("3"), lex_char("4"), lex_char("5"), lex_char("6"),
lex_char("7"), lex_char("8"), lex_char("9")}
* User written one, that are functions that must generate a node of the AST as a "node" attribute of the "state" object.
To do that they must use attributes of the state object filled by the generic one like
* "parsed", the string parsed by the last lexing function,
* "acc", the list of nodes that are generated by the function that are passed to function that iterates their call namely "star" and "plus"
They return nothing if the parse fails, or the state of the last state returned by a lexer function called if they don't.
Other functions are
* maybe : a function to avoid checking if the state is nil every time : if the state is in a fail state,
it does not apply the parsing or lexing function. Maybe allows to chain the application of lexing and parsing function easily.
attribute of the "newstate" object to be able to access the "acc" or "node" of the previous object in a chain, for example.
* idop : a function that takes a function as parameter that is used to compute an action on the current state in a chain of parsing function,
mainly modifying variables in the closure. Functions passed to idop returns nothing. Usable for example for logging.
* statemodop : a function that takes a function as parameter ; who computes a node in the AST from the state and the variables of the closure and typically adding
it to the state.
Those functions have the same signatures than parsing functions, but typically they do not parse anything.
--]]
--------------------------------------------------------------------------------
-- lexer
--------------------------------------------------------------------------------
local function lex_regex(state, regex)
res = string.match(state.newstate.str, "^" .. regex)
if res then
local newstate = {}
local len = string.len(res)
newstate.str = state.newstate.str:sub(len + 1)
newstate.pos = state.newstate.pos + len
return {
['lexed'] = res,
["newstate"] = newstate,
["node"] = state.node -- forwarding node in case it's not consumed
-- case of lexing a closing parenthesis after the node creation.
-- this is to avoid to have to create a closure variable
}
else
return nil
end
end
lexer.lex_regex = lex_regex
local function lex_epsilon(state)
return lex_regex(state, "")
end
--[[
Tests: p.parse("a", p.chain{p.lexer.lex_epsilon, p.lex_char("a"),p.lexer.lex_epsilon})
--]]
lexer.lex_epsilon = lex_epsilon
local lex_char = function(char)
return function(state)
return lex_regex(state, char)
end
end
-- tested with "p.parse("/", p.lex_char('/'))"
lexer.lex_char = lex_char
function lexer.open_parenthesis(state)
return lex_regex(state, "[(]")
end
function lexer.close_parenthesis(state)
return lex_regex(state, "[)]")
end
function lexer.lex_integer(state)
return lex_regex(state, "[0-9]+")
end
function lexer.eat_blanks(state)
return lex_regex(state,' *')
end
parser.lexer = lexer
-------------------------------------------------------
-- generic parser property
----------------------------------------------------------
local function maybe(func)
return function(state)
if state then
return func(state)
end
end
end
parser.maybe = maybe
local function accumulate(aggregate_func)
return function(state)
return {
["newstate"]=state.newstate,
["node"]= aggregate_func(state.acc)
}
end
end
parser.accumulate = accumulate
local function map_node(map_func)
return function(state)
if state then
return {
["newstate"] = state.newstate,
["node"] = map_func(state.node)
}
end
end
end
parser.map_node = map_node
local function idop(func)
return function(state)
if state then
func(state)
return state
end
end
end
parser.idop = idop
-- logs
local show = function (string) return idop(function(state) mw.log(string) end) end
parser.log = show
parser.show = show
local dump_state = idop(function(state) mw.logObject(state) end)
parser.dump_state = dump_state
-- this allows avoiding to pass the state beetween each functions if they were called by hand
local function chain(funcs)
return function(state)
local i = 1
local res = funcs[1](state)
while i < #funcs and res do
i = i+1
if funcs[i] == nil then
error("a nil func in chain")
end
res = funcs[i](res)
end
return res
end
end
--[[
Tests :
p.parse("aba", p.chain{p.lex_char('a'), p.lex_char('b'), p.lex_char('a')}) => yes
p.parse("aba", p.chain{p.lex_char('a'), p.lex_char('b')}) => nope
p.parse("aba", p.chain{p.lex_char('a'), p.lex_char('b'), p.lex_char('a'), p.lex_char('a')}) => nope
--]]
parser.chain = chain
-- higher order function that can parse an alternative between several non terminals.
-- returns the state of the first match
local function alternative(funcs)
return function(state)
local i = 1
while i <= #funcs do
local res = funcs[i](state)
if res then
return res
end
i=i+1
end
end
end
--[[
Tests :
p.parse("a", p.alternative{p.lex_char('a'), p.lex_char('b')}) => yes
p.parse("b", p.alternative{p.lex_char('a'), p.lex_char('b')}) => yes
p.parse("c", p.alternative{p.lex_char('a'), p.lex_char('b')}) => nope
--]]
parser.alternative = alternative
function star(parsing_func)
local function star_rec(state, acc, i)
res = chain{
parsing_func,
idop(
function (stat)
table.insert(acc, stat.node)
end
),
}(state)
if res then
return star_rec(res, acc, i + 1)
else
return state, acc
end
end
return function(state)
if state then
local acc = {}
result, acc2 = star_rec(state, acc, 1)
result.acc = acc2
return result
end
end
end
--[[
Tests: p.parse("aaab", p.chain{p.star(p.lex_char("a")), p.lex_char("b")}) => yes
p.parse("b", p.chain{p.star(p.lex_char("a")), p.lex_char("b")}) => yes
--]]
parser.star = star
function plus(parsing_func)
return function(state)
local firstnode
local acc
return chain{
parsing_func,
idop(
function(state)
firstnode = state.node
end
),
star(parsing_func),
function(state)
acc = state.acc
table.insert(acc, 1, firstnode)
state.acc = acc
return state
end
}(state)
end
end
--[[
res = p.parse("aaab",
p.chain{
p.plus(
p.chain{
p.lex_char("a"),
p.statemodop(function(res) res.node="a" ; return res; end)
}
),
p.lex_char("b")
}
)
--]]
parser.plus = plus
--[[
Tests :
p.parse("a", p.chain{p.lex_char('a'), p.idop(function (state) end )}) => yes
p.parse("ab", p.chain{p.lex_char('a'), p.idop(function (state) end), p.lex_char('b') }) => nope
p.parse("ab", p.chain{p.lex_char('a'), p.idop(function (state) end), p.lex_char('a') }) => nope
--]]
function questionmark(parsing_func)
return function(state)
local node = nil
local res=alternative{
chain{
parsing_func,
idop(function (stat) node = stat.node end)
},
lex_epsilon
}(state)
res.node = node
return res
end
end
parser.questionmark = questionmark
--------------------------------------------------------------------------------
-- main function that launches the parsing
--------------------------------------------------------------------------------
parser.parse = function (string, base_parsing_function)
local state = {}
state.newstate = {}
state.newstate.str = string
state.newstate.pos = 1
local res_node = nil
local res = chain{
base_parsing_function,
idop(function(stat) res_node = stat.node end),
lexer.eat_blanks
}(state)
if res and res.newstate.str=="" then
return res_node
end
return nil
end
-- a function to create an AST node which represents an nary operator. Could parse an expression on the form « 1 + 2 + 3 + 4 » and return a « sum » AST node wich have a list of nodes for operands sum([number(1), number(2), number(3) number(4) ) in our example
local function nary_op_parser (
first_node_parser, -- a parser that parses only the first element of the operation (« 1 »)
next_nodes_parser, -- a parser that parses the infix operation and one node ( « + 2 » and the subsequent)
acc_result_node_creator,-- a function that takes the lists of the node generated by the previous parser and generates the final node
single_creator -- optional, a function corner case when the expression has no actual operator one element like « 1 » (one might just want to get a « number(1) » AST node instead of « sum([number(1)]) »
-- if that’s not the wanted behavior, provide a function to build a node specific for that case. Takes a node, the result of the first_node_parser .
)
return function(state)
local res
local firstNode
res = chain{
first_node_parser,
idop(
function(state)
firstNode = state.node
end
),
star(
next_nodes_parser
)
}(state)
if res then
if res.acc and #(res.acc) > 0 then
local acc = res.acc
table.insert(acc, 1, firstNode)
res.node = acc_result_node_creator(acc)
else
single_creator = single_creator or function(node) return node end
res.node = single_creator(firstNode)
end
res.acc = nil
return res
end
end
end
parser.nary_op_parser = nary_op_parser
return parser