Parser identifying tokens in the wrong way (Flex and Bison)

2

I am implementing a compiler based on the Tiger language proposed by Andrew Appel's book "Modern Compiler Implementation in C". Link: link

I'm using the available files on the book's website, but when I run the parser, the tokens reported in the error message are wrong. For example, using this input file:

    let 
        type rectype = {name:string, id:int}

        var a:= rectype nil
    in
        a
    end

I get the following error:

testes/teste1:1.1: syntax error, unexpected MAIOR_IGUAL
Parsing failed

However the token it should show would be "LET". I have already tested with the lexer and the tokens shown by it are correct:

Token:        LET | Posicao:    1 | Valor:    -
Token:       TYPE | Posicao:    6 | Valor:    -
Token:         ID | Posicao:   11 | Valor: rectype 
Token:      IGUAL | Posicao:   19 | Valor:    -
Token:     ACHAVE | Posicao:   21 | Valor:    -
Token:         ID | Posicao:   22 | Valor: name
....

I've come to the conclusion that I'm doing something in the compilation, or else, my tiger.grm file (the bison input file) has something wrong, though, I do not know what. I compile the lexer and use the file "lex.yy.c" which is generated to compile with the parser afterwards. Lexer Compilation:

lextest: driver.o lex.yy.o errormsg.o util.o
    cc -g -o lextest driver.o lex.yy.o errormsg.o util.o

driver.o: driver.c tokens.h errormsg.h util.h
    cc -g -c driver.c

errormsg.o: errormsg.c errormsg.h util.h
    cc -g -c errormsg.c

lex.yy.o: lex.yy.c tokens.h errormsg.h util.h
    cc -g -c lex.yy.c

lex.yy.c: tiger.l
    lex tiger.l

util.o: util.c util.h
    cc -g -c util.c

clean: 
    rm -f a.out util.o driver.o lex.yy.o lex.yy.c errormsg.o

Compilation of the parser (using lex.yy.c generated earlier):

a.out: parsetest.o y.tab.o lex.yy.o errormsg.o util.o
cc -g parsetest.o y.tab.o lex.yy.o errormsg.o util.o

parsetest.o: parsetest.c errormsg.h util.h
    cc -g -c parsetest.c

y.tab.o: y.tab.c
    cc -g -c y.tab.c

y.tab.c: tiger.grm
    yacc -dv tiger.grm

y.tab.h: y.tab.c
    echo "y.tab.h was created at the same time as y.tab.c"

errormsg.o: errormsg.c errormsg.h util.h
    cc -g -c errormsg.c

lex.yy.o: lex.yy.c y.tab.h errormsg.h util.h
    cc -g -c lex.yy.c

#lex.yy.c: tiger.lex
#   lex tiger.lex

util.o: util.c util.h
    cc -g -c util.c

clean: 
    rm -f a.out util.o parsetest.o lex.yy.o errormsg.o y.tab.c y.tab.h y.tab.o

The lexer and parser are (respectively):

%{
#include <string.h>
#include "util.h"
#include "tokens.h"
#include "errormsg.h"

int charPos=1;

int yywrap(void){
    charPos=1;
    return 1;
}

void adjust(void){
    EM_tokPos=charPos;
    charPos+=yyleng;
}

%}

digit       [0-9]
letter      [a-zA-Z]
integer     {digit}+
identifier  {letter}("_"|{letter}|{digit})*
%x C_COMMENT

%%

{integer}       {adjust(); yylval.ival = atoi(yytext);   return INT;}

"/*"                        {BEGIN(C_COMMENT);}
<C_COMMENT>"*/"             {BEGIN(INITIAL);}
<C_COMMENT>.                { }

<INITIAL>"to"                       {adjust(); return TO;}
<INITIAL>"break"                        {adjust(); return BREAK;}
<INITIAL>"let"                      {adjust(); return LET;}
<INITIAL>"in"                       {adjust(); return IN;}
<INITIAL>"end"                      {adjust(); return END;}
<INITIAL>"function"                     {adjust(); return FUNCTION;}
<INITIAL>"var"                      {adjust(); return VAR;}
<INITIAL>"type"                         {adjust(); return TYPE;}
<INITIAL>"array"                    {adjust(); return ARRAY;}
<INITIAL>"if"                       {adjust(); return IF;}
<INITIAL>"then"                         {adjust(); return THEN;}
<INITIAL>"else"                         {adjust(); return ELSE;}
<INITIAL>"do"                       {adjust(); return DO;}
<INITIAL>"of"                       {adjust(); return OF;}
<INITIAL>"nil"                      {adjust(); return NIL;}
<INITIAL>"string"                   {adjust(); return STRING;}

<INITIAL>","                        {adjust(); return VIRGULA;}
<INITIAL>":"                        {adjust(); return DOIS_PONTOS;}
<INITIAL>";"                        {adjust(); return PTO_VIRGULA;}
<INITIAL>"("                        {adjust(); return APARENT;}
<INITIAL>")"                        {adjust(); return FPARENT;}
<INITIAL>"["                            {adjust(); return ACOLCH;}
<INITIAL>"]"                        {adjust(); return FCOLCH;}
<INITIAL>"{"                            {adjust(); return ACHAVE;}
<INITIAL>"}"                        {adjust(); return FCHAVE;}
<INITIAL>"."                        {adjust(); return PONTO;}
<INITIAL>"+"                        {adjust(); return SOMA;}
<INITIAL>"-"                        {adjust(); return SUB;}
<INITIAL>"*"                        {adjust(); return MUL;}
<INITIAL>"/"                        {adjust(); return DIV;}
<INITIAL>"="                        {adjust(); return IGUAL;}
<INITIAL>"<>"                       {adjust(); return DIF;}
<INITIAL>"<"                        {adjust(); return MENOR;}
<INITIAL>"<="                       {adjust(); return MENOR_IGUAL;}
<INITIAL>">"                        {adjust(); return MAIOR;}
<INITIAL>">="                       {adjust(); return MAIOR_IGUAL;}
<INITIAL>"&"                        {adjust(); return AND;}
<INITIAL>"|"                        {adjust(); return OR;}
<INITIAL>":="                       {adjust(); return ATRIBUICAO;}

<INITIAL>{identifier}               {adjust(); yylval.sval = String(yytext); return ID;}
<INITIAL>[0-9][a-z]*                    {adjust(); EM_error(EM_tokPos, "Erro: token ilegal.");}

<INITIAL>[ \t]+                     {adjust(); continue;}
<INITIAL>\n                         {adjust(); EM_newline();}
<INITIAL>[a-z]"."                       {adjust(); EM_error(EM_tokPos, "Erro: token ilegal.");}
<INITIAL>.                          {adjust(); EM_error(EM_tokPos, "Unknown character");}

%%

Disregarding grammar, I can not test it with this error, so it probably must be all wrong. I already tested with a simple good and I saw that it does not influence the error.

%{
#include <stdio.h>
#include "util.h"
#include "errormsg.h"

int yylex(void); /* function prototype */

void yyerror(char *s){
    EM_error(EM_tokPos, "%s", s);
}

%}

%union {
    int pos;
    int ival;
    string sval;
}

%token <sval> ID STRING
%token <ival> INT

%error-verbose

%token  TO BREAK LET IN END FUNCTION VAR
%token  TYPE ARRAY IF THEN ELSE DO OF NIL
%token  VIRGULA DOIS_PONTOS PTO_VIRGULA
%token  APARENT FPARENT ACHAVE FCHAVE 
%token  ACOLCH FCOLCH PONTO SOMA SUB MUL
%token  DIV IGUAL DIF MENOR MENOR_IGUAL
%token  MAIOR MAIOR_IGUAL AND OR ATRIBUICAO

%left SOMA SUB
%left DIV MUL
%left USUB

%start program

%%

program:    LET comando IN exp END
            |exp

comando:    comando                         {}
            |exp                            {}
            |ifthenelse                     {}
            |atrib                          {}
            |funcao                         {}
            |record                         {}  

exp:        exp AND exp                     {}
            |exp OR exp                     {}
            |exp1                           {}

exp1:       exp2 MAIOR exp2                 {}
            |exp2 MAIOR_IGUAL exp2          {}
            |exp2 MENOR exp2                {}
            |exp2 MENOR_IGUAL exp2          {}
            |exp2 IGUAL exp2                {}
            |exp2 DIF exp2                  {}
            |exp2                           {}

exp2:       exp2 SOMA exp2                  {}
            |exp2 SUB exp2                  {}
            |SUB exp2 %prec USUB            {}
            |exp2 MUL exp2                  {}
            |exp2 DIV exp2                  {}
            |APARENT exp2 FPARENT           {}
            |INT                            {}
            |ID                             {}

ifthenelse: IF exp THEN comando ELSE comando        {}
            |IF exp THEN comando                    {}

atrib:      VAR ID ATRIBUICAO exp                   {}
            |VAR ID DOIS_PONTOS ID ATRIBUICAO exp   {}
            |VAR ID ATRIBUICAO NIL                  {}

record:     TYPE ID IGUAL STRING                    {}
            |TYPE ID IGUAL INT                      {}
            |TYPE ID IGUAL ACHAVE arg FCHAVE        {}

arg:        ID DOIS_PONTOS ID VIRGULA arg           {}
            |ID DOIS_PONTOS ID                      {}

funcao:     FUNCTION ID APARENT arg FPARENT DOIS_PONTOS ID IGUAL program {}
    
asked by anonymous 17.02.2016 / 21:05

1 answer

2

No Lexer

There are 2 problems:

Here, the token INT returns an integer, but in the parser, it expects a type int :

integer     {digit}+
...
{integer}       {adjust(); yylval.ival = atoi(yytext);   return INT;}

And here, it returns a token STRING :

<INITIAL>"string" {adjust(); return STRING;}

However, in the parser, the token STRING is declared as a string sval; value and not as a data type.

The solution to the lexer is to create two more tokens that represent types, for example:

<INITIAL>"string"                   { adjust(); return TSTRING;}
<INITIAL>"int"                      { adjust(); return TINT;}

And add the following lines in the parser :

%token TSTRING
%token TINT


No parser

The grammar really is incorrect and this is what is causing the error:

  

tests / test1: 1.1: syntax error, unexpected MAIOR_IGUAL    Parsing failed

This error is generated by the parser that is probably traversing a path invalid when interpreting grammar.

The best reference about these tools is the manuals:

Lexical Analysis With Flex

Bison 3.0.4

Below is a lexer / parser ( well-simplified ) version of the lexer / parser that can recognize the example sample file:

Lexer:

 %{
#include <string.h>
#include "t.tab.h"

YYSTYPE yylval;
char err_buffer[512];

// Coloquei esta função para analisar a saída do lexer
void deb(const char *msg)
{
    // para visualizar os tokens, basta retirar o comentário abaixo
    // printf("Token: %s\n", msg);
}

%}

%option noyywrap

digit       [0-9]
letter      [a-zA-Z]
integer     {digit}+
identifier  {letter}("_"|{letter}|{digit})*
%x C_COMMENT

%%
"/*"                              { BEGIN(C_COMMENT); }
<C_COMMENT>"*/"                   { BEGIN(INITIAL); }
<C_COMMENT>.                      { deb("CM "); }
<INITIAL>"to"                     { deb("To"); return TO; }
<INITIAL>"break"                  { deb("Break"); return BREAK;}
<INITIAL>"let"                    { deb("Let"); return LET; }
<INITIAL>"in"                     { deb("In"); return IN;}
<INITIAL>"end"                    { deb("End"); return END;}
<INITIAL>"function"               { deb("Function"); return FUNCTION;}
<INITIAL>"var"                    { deb("Var"); return VAR;}
<INITIAL>"type"                   { deb("Type"); return TYPE; }
<INITIAL>"array"                  { deb("Array"); return ARRAY;}
<INITIAL>"if"                     { deb("If"); return IF;}
<INITIAL>"then"                   { deb("Then"); return THEN;}
<INITIAL>"else"                   { deb("Else"); return ELSE;}
<INITIAL>"do"                     { deb("Do"); return DO;}
<INITIAL>"of"                     { deb("Of"); return OF;}
<INITIAL>"nil"                    { deb("Nil"); return NIL;}
<INITIAL>"string"                 { deb("TString"); return TSTRING;}
<INITIAL>"int"                    { deb("TString"); return TINT;}

<INITIAL>","                      { deb("Vg"); return VIRGULA;}
<INITIAL>";"                      { deb("Pv"); return PTO_VIRGULA;}
<INITIAL>"("                      { deb("Ap"); return APARENT;}
<INITIAL>")"                      { deb("Fp"); return FPARENT;}
<INITIAL>"["                      { deb("Ac"); return ACOLCH;}
<INITIAL>"]"                      { deb("Fc"); return FCOLCH;}
<INITIAL>"{"                      { deb("Ach"); return ACHAVE;}
<INITIAL>"}"                      { deb("Fch"); return FCHAVE;}
<INITIAL>"."                      { deb("Pt"); return PONTO;}
<INITIAL>"+"                      { deb("Sm"); return SOMA;}
<INITIAL>"-"                      { deb("Sub"); return SUB;}
<INITIAL>"*"                      { deb("Mul"); return MUL;}
<INITIAL>"/"                      { deb("Div"); return DIV;}
<INITIAL>"="                      { deb("Eq"); return IGUAL;}
<INITIAL>"<>"                     { deb("Dif"); return DIF;}
<INITIAL>"<"                      { deb("Men"); return MENOR;}
<INITIAL>"<="                     { deb("Me="); return MENOR_IGUAL;}
<INITIAL>">"                      { deb("Mai"); return MAIOR;}
<INITIAL>">="                     { deb("Ma="); return MAIOR_IGUAL;}
<INITIAL>"&"                      { deb("An"); return AND;}
<INITIAL>"|"                      { deb("Or"); return OR;}
<INITIAL>":="                     { deb("Atrb"); return ATRIBUICAO;}
<INITIAL>":"                      { deb("Dp"); return DOIS_PONTOS;}

<INITIAL>{identifier}             {
                                    yylval.sval = strdup(yytext);
                                    sprintf(err_buffer, "ID[%s]\n", yylval.sval);
                                    deb(err_buffer);
                                    return ID;
                                  }

<INITIAL>[0-9][a-z]*              { printf("Erro: token ilegal[0-z].\n"); }
<INITIAL>[ \t]+                   { continue;}
<INITIAL>\n                       { deb("NL"); }
<INITIAL>[a-z]"."                 { printf("Erro: token ilegal: %s\n", yytext); }
<INITIAL>.                        { printf("Unknown character\n"); }

%%

/*

 Para executar somente o lexer, basta retirar o
 comentário abaixo 'main' e executar via pipe, exemplo:

 lexer.executavel < nome_do_arquivo.txt

 Quando for compilar novamente com o parser, comentar novamente.

*/
/*
int main()
{
    int val;
    int count = 0;

    while (val = yylex());
    printf("final.\n");
}
*/

After execution, the result is:

Token: CM
Token: CM
Token: CM
Token: CM
Token: CM
Token: CM
Token: CM
Token: CM
Token: CM
Token: CM
Token: CM


Token: NL
Token: Let
Token: NL
Token: Type
Token: ID[rectype]

Token: Eq
Token: Ach
Token: ID[name]

Token: Dp
Token: TString
Token: Vg
Token: ID[id]

Token: Dp
Token: TString
Token: Fch
Token: NL
Token: NL
Token: Var
Token: ID[a]

Token: Atrb
Token: ID[rectype]

Token: Nil
Token: NL
Token: In
Token: NL
Token: ID[a]

Token: NL
Token: End
final.

tested with: flex 2.5.35 and gcc version 5.1.0 (tdm64-1) [Windows 8.1]

Parser:

One tip: to define the grammar (according to the manual) is that the lines that have recursive calls must have the first line blank. Ex:

comando:    /*  esta linha está em branco */
            |comando record                       { deb2("Comando record\n"); }


%{
#include <stdio.h>
#include <stdarg.h>

int yylex(void); /* function prototype */

void yyerror(char *s){
    printf("%s", s);
}

// Coloquei esta função para analisar a saída do parser
void deb2(const char *fmt, ...)
{
    va_list arg;

    va_start(arg, fmt);
    vfprintf(stderr, fmt, arg);
    va_end(arg);
}

%}

%union {
    int pos;
    int ival;
    char *sval;
}

%token <sval> ID
%token <sval> STRING
%token <ival> INT

%error-verbose
%token TSTRING
%token TINT
%token  TO BREAK LET IN END FUNCTION VAR
%token  TYPE ARRAY IF THEN ELSE DO OF NIL
%token  VIRGULA DOIS_PONTOS PTO_VIRGULA
%token  APARENT FPARENT ACHAVE FCHAVE
%token  ACOLCH FCOLCH PONTO SOMA SUB MUL
%token  DIV IGUAL DIF MENOR MENOR_IGUAL
%token  MAIOR MAIOR_IGUAL AND OR ATRIBUICAO

%left SOMA SUB
%left DIV MUL
%left USUB

%start program

%%

program:    LET comando IN ID END                 { deb2("Let ... ID[%s]\n", $4); }
            ;

comando:
            |comando record                       { deb2("Comando record\n"); }
            |comando atrib                        { deb2("Comando attrib\n"); }
            ;

atrib:      VAR ID ATRIBUICAO ID NIL              { deb2("Var %s := NIL\n", $2); }
            ;

record:     TYPE ID IGUAL ACHAVE arg FCHAVE       { deb2("Tipo {}\n"); }
            |TYPE ID IGUAL TSTRING                { deb2("Tipo %s : String"); }
            ;

arg:
            ID DOIS_PONTOS TSTRING VIRGULA arg    { deb2("ID[%s] : STRING,\n", $1); }
            |ID DOIS_PONTOS TINT VIRGULA arg      { deb2("ID[%s] : INT, ", $1);  }
            |ID DOIS_PONTOS TSTRING               { deb2("ID[%s] : STRING\n", $1);  }
            |ID DOIS_PONTOS TINT                  { deb2("ID[%s] : INT\n", $1);  }
            ;

%%

int main()
{
    yyparse();
}

After execution:

ID[id] : INT
ID[name] : STRING,
Tipo {}
Comando record
Var a := NIL
Comando attrib
Let ... ID[a]

tested with: bison (GNU Bison) 2.4.2 and gcc version 5.1.0 (tdm64-1) [Windows 8.1]     

18.02.2016 / 06:23