KOI: Exercise 3: The basics of parsing, simple top-down parsers

You can do this exercise in any size group you wish. Don't worry if you find some of the questions difficult to answer, since they may be intended to initiate a discussion rather than have a single correct answer.

Preparations

Before this exercise you should have:

Question 1

This is the program parser-1. It implements a simple parser, with some additional code to make it a complete program.

a) What is the grammar that this parser implements?

b) What is the language that it defines?

(Some helpful hints: It only understands the tokens, or terminals, a, b and c. Finish the input with EOF, which in Linux is entered using CTRL-D on a separate line. All other input, such as d, is ignored. You can compile and run the program if you want, but you don't have to.)

Download

// A simple parser: parser-1.c

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>

int lookahead;

void error(void) {
    printf("Syntax error: unexpected token '%c'\n", lookahead);
    exit(EXIT_FAILURE);
}

// A simple scanner.
// Returns one of the tokens 'a', 'b' or 'c', or EOF. Other input is ignored.
int scan(void) {
    int input;
    while ((input = getchar()) != 'a' && input != 'b' && input != 'c' && input != EOF)
        ;
    return input;
}

void match(int expected) {
    if (lookahead == expected)
        lookahead = scan();
    else
        error();
}

// These are the non-terminals, calling each other recursively
void S(void), T(void), U(void);

void S(void) {
    T();
}

void T(void) {
    match('a'); match('b'); U();
}

void U(void) {
    match('c');
}

int main(void) {
    printf("Parser 1. Enter input. End with EOF (CTRL-D on Linux).\n");
    lookahead = scan(); // To get started, so we have somthing to compare to
    S();
    if (lookahead != EOF)
        error();
    printf("Done!\n");
    return EXIT_SUCCESS;
}

Question 2

This is the program parser-2. It implements a simple parser, with some additional code to make it a complete program.

a) What is the grammar that this parser implements?

b) What is the language that it defines?

Download

// A simple parser: parser-2.c

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>

int lookahead;

void error(void) {
    printf("Syntax error: unexpected token '%c'\n", lookahead);
    exit(EXIT_FAILURE);
}

// A simple scanner.
// Returns one of the tokens 'a', 'b' or 'c', or EOF. Other input is ignored.
int scan(void) {
    int input;
    while ((input = getchar()) != 'a' && input != 'b' && input != 'c' && input != EOF)
        ;
    return input;
}

void match(int expected) {
    if (lookahead == expected)
        lookahead = scan();
    else
        error();
}

// These are the non-terminals, calling each other recursively
void S(void), T(void), U(void);

void S(void) {
    T(); T(); U();
}

void T(void) {
    match('a'); U();
}

void U(void) {
    match('b'); match('c');
}

int main(void) {
    printf("Parser 2. Enter input. End with EOF (CTRL-D on Linux).\n");
    lookahead = scan(); // To get started, so we have somthing to compare to
    S();
    if (lookahead != EOF)
        error();
    printf("Done!\n");
    return EXIT_SUCCESS;
}

Question 3

This is the program parser-3. It implements a simple parser, with some additional code to make it a complete program.

a) What is the grammar that this parser implements?

b) What is the language that it defines?

Download

// A simple parser: parser-3.c

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>

int lookahead;

void error(void) {
    printf("Syntax error: unexpected token '%c'\n", lookahead);
    exit(EXIT_FAILURE);
}

// A simple scanner.
// Returns one of the tokens 'a', 'b' or 'c', or EOF. Other input is ignored.
int scan(void) {
    int input;
    while ((input = getchar()) != 'a' && input != 'b' && input != 'c' && input != EOF)
        ;
    return input;
}

void match(int expected) {
    if (lookahead == expected)
        lookahead = scan();
    else
        error();
}

// These are the non-terminals, calling each other recursively
void S(void), T(void), U(void), V(void);

void S(void) {
    if (lookahead == 'a') {
        match('a'); T();
    }
    else if (lookahead == 'b') {
        match('b'); T();
    }
    else {
        error();
    }
}

void T(void) {
    if (lookahead == 'a') {
        U(); match('c');
    }
    else if (lookahead == 'b') {
        V(); match('c');
    }
    else {
        error();
    }
}

void U(void) {
    match('a'); match('b');
}

void V(void) {
    match('b'); match('b');
}

int main(void) {
    printf("Parser 3. Enter input. End with EOF (CTRL-D on Linux).\n");
    lookahead = scan(); // To get started, so we have somthing to compare to
    S();
    if (lookahead != EOF)
        error();
    printf("Done!\n");
    return EXIT_SUCCESS;
}

Question 4

This is the program parser-4. It implements a simple parser, with some additional code to make it a complete program.

a) What is the grammar that this parser implements?

b) What is the language that it defines?

Download

// A simple parser: parser-4.c

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>

int lookahead;

void error(void) {
    printf("Syntax error: unexpected token '%c'\n", lookahead);
    exit(EXIT_FAILURE);
}

// A simple scanner.
// Returns one of the tokens 'a', 'b' or 'c', or EOF. Other input is ignored.
int scan(void) {
    int input;
    while ((input = getchar()) != 'a' && input != 'b' && input != 'c' && input != EOF)
        ;
    return input;
}

void match(int expected) {
    if (lookahead == expected)
        lookahead = scan();
    else
        error();
}

// These are the non-terminals, calling each other recursively
void S(void), T(void);

void S(void) {
    if (lookahead == 'a') {
        match('a'); T(); S();
    }
    else if (lookahead == 'b') {
        match('b');
    }
    else {
        error();
    }
}

void T(void) {
    match('a'); match('b'); match('c');
}

int main(void) {
    printf("Parser 4. Enter input. End with EOF (CTRL-D on Linux).\n");
    lookahead = scan(); // To get started, so we have somthing to compare to
    S();
    if (lookahead != EOF)
        error();
    printf("Done!\n");
    return EXIT_SUCCESS;
}

Question 5

A language consists of zero or more instances of ab, followed by ba. Some examples of valid input are ba, abba, ababba and abababba. Some examples of invalid input are ab, aa and the empty input.

a) Write a grammar for this language.

b) Write a parser that implements this grammar.

Question 6

In exercise-2 we saw this grammar for a subset of C:
statement-list -> statement statement-list | empty
statement -> variable = integer-literal ;
condition -> variable < variable
statement -> if ( condition ) statement
The terminal variable is a variable name, and consists of one or more lower-case letters. The terminal integer-literal is an integer literal, and consists of one or more normal decimal digits. Empty stands for an empty production. The start symbol is statement-list.

As an example of valid input, we saw this:

if (hjalmar < hulda) if (z < x) w = 33; kalle = 17;
a) Write some more examples of valid input.

b) Write some examples of invalid input.

c) Write a parser for the language. Verify that the inputs in a and b are handled as expected. Below is a program skeleton with a scanner and some definitions, so you only have to add the actual parsing part.

Download

// A parser for a subset of C: parser-6.c

#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>

// Single-character tokens are reprseted by their character code.
// These are the other token codes.
enum token_codes {
    VARIABLE = 1000,
    INTEGER_LITERAL = 1001,
    IF = 1002
};

int lookahead;

void error(void) {
    printf("[Syntax error: unexpected token '%c' (code %d)]\n",
           (isprint(lookahead) ? lookahead : '?'), lookahead);
    exit(EXIT_FAILURE);
}

// A scanner for a subset of C.
// Returns one of the tokens '(', ')', ';', '<' and ';',
// or one of the tokens VARIABLE, INTEGER_LITERAL, IF, or EOF.
// Whitespace is ignored.
// Any other  character is an error.
int scan(void) {
    while(1) {
        int c = getchar();
        if (c == EOF)
            return EOF;
        else if (isspace(c))
            ; // Just skip whitespace
        else if (isdigit(c)) {
            // This is the start of an integer literal
            // Read it, but ignore the value
            while (isdigit(c)) {
                c = getchar();
            }
            if (c != EOF)
                ungetc(c, stdin);
            return INTEGER_LITERAL;
        }
        else if (isalpha(c)) {
            // This is the start of a keyword or variable name
            #define MAX_ID_LENGTH 100
            char buffer[MAX_ID_LENGTH + 1];
            int n = 0;
            while (isalpha(c)) {
                buffer[n++] = c;
                if (n > MAX_ID_LENGTH) {
                    printf("[Error: Identifier too long]\n");
                    exit(EXIT_FAILURE);
                }
                c = getchar();
            }
            buffer[n] = '\0';
            if (c != EOF)
                ungetc(c, stdin);
            if (strcmp(buffer, "if") == 0)
                return IF;
            else
                return VARIABLE;
        }
        else if (strchr("();<=", c) != NULL)
            return c;
        else {
            printf("[Error: Unknown character: '%c']\n", c);
            exit(EXIT_FAILURE);
        }
    } // while
} // scan

void match(int expected) {
    if (lookahead == expected)
        lookahead = scan();
    else
        error();
}

// These are the non-terminals, calling each other recursively
void statement_list(void), statement(void), condition(void);

// Write the functions statement_list, statement and condition!

int main(void) {
    printf("Parser 6. Enter input. End with EOF (CTRL-D on Linux).\n");
    lookahead = scan(); // To get started, so we have somthing to compare to
    statement_list();
    if (lookahead != EOF)
        error();
    printf("Done!\n");
    return EXIT_SUCCESS;
}

There are some solutions to some of the questions, but try to solve them yourself first.


Thomas Padron-McCarthy (thomas.padron-mccarthy@oru.se) November 5, 2024