User:Haus/Hanzo/lexer
Appearance
Here's a (somewhat dated) version of the guts of a JFlex specification for parsing ship infoboxes:
...
%%
%{
  /** Parameter name most recently captured by the NAME state; null until one has been read. */
  private String name;

  /** Text gathered for the current parameter's value; null until one has been read. */
  private String value;

  /**
   * Number of template-opening brace pairs currently open. Despite its name
   * it tracks template nesting, not comments: it is bumped when an infobox
   * header or a nested opener is matched and dropped on each closer.
   */
  private int comment_count = 0;
%}
// Scanner options. None of the rules visible in this excerpt read yyline or yychar.
// Keep the current line number available in yyline.
%line
// Keep the current character offset available in yychar.
%char
// Match literal text and character classes without regard to letter case.
%caseless
// Scan Unicode input rather than a 7/8-bit character set.
%unicode
// Generate a main() that runs the scanner over the input file(s) named on the
// command line; the scanning method then returns int by default.
%standalone
// Uncomment to generate a debugging scanner that traces each match.
//%debug
// Lexical states switched to with yybegin(): inside an infobox, reading a
// parameter name, and reading a parameter value.
%state SHIPBOX
%state NAME
%state VALUE
/* Macro definitions. Only LineTerminator and WhiteSpace are referenced by the
   rules visible in this excerpt; the others are kept for any rules outside it. */

/* A single ASCII letter. */
ALPHA=[A-Za-z]
/* A single decimal digit. */
DIGIT=[0-9]
/* Horizontal white space: space, tab, backspace, form feed. This previously
   listed the octal escape for line feed, which contradicted the macro's name;
   form feed is used instead, in line with the WhiteSpace macro below. */
NONNEWLINE_WHITE_SPACE_CHAR=[\ \t\b\f]
/* The class above plus line feed. */
WHITE_SPACE_CHAR=[\n\ \t\b\f]
/* Body of a double-quoted string: an escaped quote, any character other than
   a newline or a quote, or a backslash-delimited run of white space. */
STRING_TEXT=(\\\"|[^\n\"]|\\{WHITE_SPACE_CHAR}+\\)*
/* One line ending in any of the three common conventions. */
LineTerminator = \r|\n|\r\n
/* Any character that is not part of a line ending. */
InputCharacter = [^\r\n]
/* A line ending, or a single space, tab or form feed. */
WhiteSpace = {LineTerminator} | [ \t\f]
%%
<YYINITIAL> {
  /* Start of a ship infobox: either template name, any number of pipes, then
     one WhiteSpace match. Switches to SHIPBOX and counts the opening braces.
     NOTE(review): the catch-all rule below can match a whole line, and the
     longest match wins, so a header followed by more text on the same line
     appears to be echoed instead of recognised - confirm against real input. */
  "{{Infobox Ship"[|]*{WhiteSpace} |
  "{{Ship table"[|]*{WhiteSpace}   {
      comment_count++;
      yybegin(SHIPBOX);
      return 1;
  }

  /* Anything else outside an infobox is echoed to standard output, one line
     (plus any immediately following blank lines) per match. */
  [^\n]*[\n]*   {
      // printlns replaced to preserve UTF-8
      // NOTE(review): the matched text already ends with its newline(s) and
      // println appends one more - confirm the extra line break is wanted.
      final String passthrough = yytext();
      System.out.println(passthrough);
      return 100;
  }
}
<SHIPBOX> {
  /* A nested template opens: raise the depth so that its closer does not end
     the infobox. The brace text itself is not added to the value.
     NOTE(review): pipes inside a nested template are still handled by the
     separator rule below - confirm whether that is intended. */
  "{{"   { comment_count = comment_count + 1; }

  /* A template closes (optionally preceded by pipes). When the depth returns
     to zero the infobox is complete: emit the pending pair, emit the box, and
     go back to echoing ordinary text. */
  [\|]*"}}"   {
      comment_count = comment_count - 1;
      Utility.Assert(comment_count >= 0);
      if (comment_count == 0) {
          // Same guard as the separator rule: an empty box has no pair to emit.
          if (name != null && value != null) {
              ShipBox.printnv(name, value);
          }
          ShipBox.printbox();
          // Clear per-box state so the next infobox cannot re-emit this
          // box's last pair on its first separator.
          name = null;
          value = null;
          yybegin(YYINITIAL);
      }
  }

  /* Parameter separator: emit the pair collected so far, if any, then read
     the next parameter name. */
  \|   {
      if (name != null && value != null) {
          ShipBox.printnv(name, value);
      }
      yybegin(NAME);
  }

  /* Any other single character continues the current value (for example the
     later lines of a multi-line value). Starting from null must not produce
     the literal text "null", which plain string concatenation would. */
  [^\|]   { value = (value == null) ? yytext() : value + yytext(); }
}
<NAME> {
  /* A parameter name is everything up to and including the next equals sign;
     the sign stays part of the stored name. Afterwards the value is read.
     NOTE(review): the pattern excludes only the equals sign, so it can run
     across pipes and line breaks when a parameter has no equals sign -
     confirm inputs always use the name=value form. */
  [^=]*"="   {
      name = yytext();
      yybegin(VALUE);
  }
}
<VALUE> {
  /* The first line of a value: the rest of the current line together with the
     line-break characters that end it. Control then returns to SHIPBOX, which
     appends any continuation text.
     NOTE(review): closing braces on the same line are consumed here as part of
     the value, so SHIPBOX never sees them - confirm boxes always close on a
     line of their own. */
  [^\n\r]*[\n\r]+   {
      value = yytext();
      yybegin(SHIPBOX);
  }
}