TransWikia.com

Lexical Analyzer In C++

Code Review Asked on January 4, 2022

So this is my first time writing a Lexer, and I want to make sure I’m doing it right. The lexer is not complete for a programming language right now, because I think I can easily add more stuff later. Here is the code:

main.cpp

#include <iostream>
#include <fstream>
#include <memory>
#include <unordered_map>
#include "lexer.h"

std::vector<std::unique_ptr<table_entry>> symbol_table;

int main(int argc, char **argv) {
  --argc;
  ++argv;
  std::string input_string;
  std::ifstream file(*argv);
  std::string next_line;
  char last_char;

  while (std::getline(file, next_line)) {
    last_char = next_line.back();
    next_line.pop_back();
    input_string.append(next_line + "n");
  }

  input_string.pop_back();
  input_string.append(std::string(1, last_char));

//  std::cout << input_string << std::endl;

  init_lexer(&input_string);

  table_entry* next_token;

  while((next_token = next())->type != token_type::end) { // Just For Testing, I'll remove this later.
    std::cout << next_token->lexme << " " << next_token->type << std::endl;
  }
}

lexer.h

#ifndef LEXER
#define LEXER

#include <utility>
#include <utility>
#include <utility>
#include <vector>
#include <list>
#include <unordered_set>
#include <sstream>
#include <string>

#define letter_pair(num) std::make_pair(num, 'a'), std::make_pair(num, 'b'), std::make_pair(num, 'c'), std::make_pair(num, 'd'), std::make_pair(num, 'e'), std::make_pair(num, 'f'), std::make_pair(num, 'g'), std::make_pair(num, 'h'), std::make_pair(num, 'i'), std::make_pair(num, 'j'), std::make_pair(num, 'k'), std::make_pair(num, 'l'), std::make_pair(num, 'm'), std::make_pair(num, 'n'), std::make_pair(num, 'o'), std::make_pair(num, 'p'), std::make_pair(num, 'q'), std::make_pair(num, 'r'), std::make_pair(num, 's'), std::make_pair(num, 't'), std::make_pair(num, 'u'), std::make_pair(num, 'v'), std::make_pair(num, 'w'), std::make_pair(num, 'x'), std::make_pair(num, 'y'), std::make_pair(num, 'z')

enum token_type {
  /// Relative Operations ///
  not_op /* ! */,
  lt /* < */,
  le /* <= */,
  eq /* == */,
  ne /* != */,
  gt /* > */,
  ge /* >= */,

  /// Assignment ///
  assign, /* = */

  /// Identifiers ///
  id /* Variable/Function Names */,
  num /* Numbers */,
  string_lit /* strings */,
  decimal /* decimal number */,

  /// Conrol Flow ///
  if_stmt /* if */,
  elif_stmt  /* elif */,
  else_stmt /* else */,
  while_stmt /* while */,
  for_stmt /* for */,

  /// Other ///
  ws /* Space, New Line, Or Tabs */,
  end /* End Of File */
};

struct table_entry {
  std::string lexme;
  token_type type;
  int line;
  int col;
  table_entry(std::string &lexme, token_type token_type, int line, int col)
      : lexme(lexme), type(token_type), line(line), col(col) {}
};

extern std::vector<std::unique_ptr<table_entry>> symbol_table;

int pos = 0, line = 1, col = 1;
std::vector<std::list<std::pair<int, char>>> state_machine;
std::unordered_set<int> accepting_states;
std::string *input;

void init_lexer(std::string *_input) {
  input = _input;

  /* Set the states of the finite state machine. You can view the output at
   * State_Machine.png, but good luck understanding it since it was drawn by
   * yours truly.
   */

  state_machine.push_back({ /* State 0 */
                              std::make_pair(1, '_'),

                              std::make_pair(2, ' '),
                              std::make_pair(2, 't'),
                              std::make_pair(2, 'n'),

                              letter_pair(3),

                              std::make_pair(4, '!'),

                              std::make_pair(6, '='),

                              std::make_pair(8, '<'),

                              std::make_pair(10, '>')
                          });

  state_machine.push_back({ /* State 1 */
                              std::make_pair(1, '_'),

                              letter_pair(3)
                          });

  state_machine.push_back({ /* State 2 */
                              std::make_pair(2, 'n'),
                              std::make_pair(2, ' '),
                              std::make_pair(2, 't')
                          });

  state_machine.push_back({ /* State 3 */
                              std::make_pair(3, '_'),
                              letter_pair(3)
                          });

  state_machine.push_back({ /* State 4 */
                              std::make_pair(5, '=')
                          });

  state_machine.emplace_back( /* State 5 */

  );

  state_machine.push_back({ /* State 6 */
                              std::make_pair(7, '=')
                          });

  state_machine.emplace_back( /* State 7 */

  );

  state_machine.push_back({ /* State 8 */
                              std::make_pair(9, '=')
                          });

  state_machine.emplace_back( /* State 9 */

  );

  state_machine.push_back({ /* State 10 */
                              std::make_pair(11, '=')
                          });

  state_machine.push_back({

                          });

  accepting_states.insert(2);
  accepting_states.insert(3);
  accepting_states.insert(4);
  accepting_states.insert(5);
  accepting_states.insert(6);
  accepting_states.insert(7);
  accepting_states.insert(8);
  accepting_states.insert(9);
  accepting_states.insert(10);
  accepting_states.insert(11);
}

table_entry *generate_entry(int state, std::string &lexme) {
  switch (state) {
    case 2: return new table_entry(lexme, token_type::ws, line, col);
    case 3:
      if (lexme == "if") {
        return new table_entry(lexme, token_type::if_stmt, line, col);
      } else if (lexme == "elif") {
        return new table_entry(lexme, token_type::elif_stmt, line, col);
      } else if (lexme == "else") {
        return new table_entry(lexme, token_type::else_stmt, line, col);
      } else if (lexme == "for") {
        return new table_entry(lexme, token_type::for_stmt, line, col);
      } else if (lexme == "while") {
        return new table_entry(lexme, token_type::while_stmt, line, col);
      } else {
        return new table_entry(lexme, token_type::id, line, col);
      }
    case 4:return new table_entry(lexme, token_type::not_op, line, col);
    case 5:return new table_entry(lexme, token_type::ne, line, col);
    case 6:return new table_entry(lexme, token_type::assign, line, col);
    case 7:return new table_entry(lexme, token_type::eq, line, col);
    case 8:return new table_entry(lexme, token_type::lt, line, col);
    case 9:return new table_entry(lexme, token_type::le, line, col);
    case 10:return new table_entry(lexme, token_type::gt, line, col);
    case 11:return new table_entry(lexme, token_type::ge, line, col);
    default:return nullptr;
  }
}

table_entry *next() {
  if (pos >= input->length()) {
    return new table_entry(*(new std::string()), token_type::end, line, col);
  }

  int state = 0;

  int _pos = pos; // Copy the positions to save the initial locations of the tokens
  int _line = line;
  int _col = col;
  std::string next_lexme;

  while (true) {
    if (_pos >= input->length()) {
      break;
    }

    char next = (*input)[_pos];

    std::list<std::pair<int, char>> &neighbors = state_machine[state];
    bool didFind = false;

    for (auto &neighbor : neighbors) {
      if (neighbor.second == next) {
        state = neighbor.first;
        didFind = true;
        break;
      }
    }

    if (!didFind) {
      break;
    }

    if (next == 'n') {
      _col = 1;
      _line++;
    } else {
      _col++;
    }

    _pos++;

    next_lexme.push_back(next);
  }

  if (accepting_states.contains(state)) {
    table_entry *ans = generate_entry(state, next_lexme);

    if (ans->type != token_type::id) {
      symbol_table.emplace_back(ans);
    }

    pos = _pos;
    line = _line;
    col = _col;
    return ans;
  } else {
    std::cerr << "Unexpected Token: " << next_lexme << " At Line " << line << " And Column " << col;
    return new table_entry(*(new std::string()), token_type::end, line, col);
  }
}

#endif // LEXER

A couple of things:

  1. My source is the Dragon Book, 1st edition.
  2. My finite automata is probably not conventional because it is not a standard DFA. The way I’m using it right now is I have a NFA but I keep track of the string going through it. Once I reach the end, I call another method to determine the correct token depending on the string and the state.
  3. I know there are better input buffering techniques that I use. What I’m doing right now is just stuffing every line into a string. I’m probably going to make it better later.
  4. Incase it helps you visualize it better, here is a drawing of the state machine (sorry for the bad drawing):

enter image description here

Add your own answers!

Ask a Question

Get help from others!

© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP