clex is a tiny, battle-tested lexer generator for C. Feed it a list of regular expressions and it will hand back tokens one by one from an input string.
Some highlights:
- Simple C API, no code generation phase.
- Regex syntax supports grouping, alternation, character classes, ranges, and
the usual
* + ?
operators. - Whitespace between tokens is skipped automatically.
- Safe failure modes – invalid rules return
false
, and the lexer yields{.kind = -1, .lexeme = NULL}
on EOF or when no rule matches.
The maximum number of rules is 1024 by default (see CLEX_MAX_RULES
in
clex.h
).
clexLexer *clexInit(void);
void clexReset(clexLexer *lexer, const char *content);
bool clexRegisterKind(clexLexer *lexer, const char *regex, int kind);
clexToken clex(clexLexer *lexer);
void clexDeleteKinds(clexLexer *lexer);
void clexLexerDestroy(clexLexer *lexer);
Common flow:
clexInit()
to allocate a lexer.- Call
clexRegisterKind()
for each token. It returnsfalse
when passed aNULL
lexer/regex, when the regex fails to compile, or when the rule table is full – check this to catch setup issues early. clexReset()
with the source buffer (you own the lifetime of the string).- Repeatedly call
clex()
until it returns the EOF sentinel above. Each token owns itslexeme
buffer; free it when no longer needed. - Tear down with
clexDeleteKinds()
for reuse, orclexLexerDestroy()
to free everything.
A Makefile is provided for easy building and testing:
# Show available commands
make help
# Run all tests
make test-all
# Run specific tests
make test-clex # Test lexer functionality
make test-regex # Test regex patterns
make test-nfa # Generate NFA graphs
# Quick test check
make check
# Build the example from this README
make example
# Build object files for library use
make lib
# Clean build artifacts
make clean
Simply pass fa.c
, fa.h
, clex.c
, and clex.h
to your compiler along with your own application that has a main
function:
gcc your_app.c fa.c clex.c -o your_app
gcc tests.c fa.c clex.c -D TEST_CLEX && ./a.out
gcc tests.c fa.c clex.c -D TEST_REGEX && ./a.out
gcc tests.c fa.c clex.c -D TEST_NFA_DRAW && ./a.out
No output means all tests passed!
You can also run the suites individually with the provided Make targets:
make test-clex # Lexer API & integration tests
make test-regex # Regex construction & matching tests
#include "clex.h"
#include <assert.h>
#include <string.h>
typedef enum TokenKind {
INT,
OPARAN,
CPARAN,
OSQUAREBRACE,
CSQUAREBRACE,
OCURLYBRACE,
CCURLYBRACE,
COMMA,
CHAR,
STAR,
RETURN,
SEMICOL,
CONSTANT,
IDENTIFIER,
} TokenKind;
int main() {
clexLexer *lexer = clexInit();
clexRegisterKind(lexer, "int", INT);
clexRegisterKind(lexer, "\\(", OPARAN);
clexRegisterKind(lexer, "\\)", CPARAN);
clexRegisterKind(lexer, "\\[|<:", OSQUAREBRACE);
clexRegisterKind(lexer, "\\]|:>", CSQUAREBRACE);
clexRegisterKind(lexer, "{|<%", OCURLYBRACE);
clexRegisterKind(lexer, "}|%>", CCURLYBRACE);
clexRegisterKind(lexer, ",", COMMA);
clexRegisterKind(lexer, "char", CHAR);
clexRegisterKind(lexer, "\\*", STAR);
clexRegisterKind(lexer, "return", RETURN);
clexRegisterKind(lexer, "[1-9][0-9]*([uU])?([lL])?([lL])?", CONSTANT);
clexRegisterKind(lexer, ";", SEMICOL);
clexRegisterKind(lexer, "[a-zA-Z_]([a-zA-Z_]|[0-9])*", IDENTIFIER);
clexReset(lexer, "int main(int argc, char *argv[]) {\nreturn 23;\n}");
clexToken token = clex(lexer);
assert(token.kind == INT);
assert(strcmp(token.lexeme, "int") == 0);
token = clex(lexer);
assert(token.kind == IDENTIFIER);
assert(strcmp(token.lexeme, "main") == 0);
token = clex(lexer);
assert(token.kind == OPARAN);
assert(strcmp(token.lexeme, "(") == 0);
token = clex(lexer);
assert(token.kind == INT);
assert(strcmp(token.lexeme, "int") == 0);
token = clex(lexer);
assert(token.kind == IDENTIFIER);
assert(strcmp(token.lexeme, "argc") == 0);
token = clex(lexer);
assert(token.kind == COMMA);
assert(strcmp(token.lexeme, ",") == 0);
token = clex(lexer);
assert(token.kind == CHAR);
assert(strcmp(token.lexeme, "char") == 0);
token = clex(lexer);
assert(token.kind == STAR);
assert(strcmp(token.lexeme, "*") == 0);
token = clex(lexer);
assert(token.kind == IDENTIFIER);
assert(strcmp(token.lexeme, "argv") == 0);
token = clex(lexer);
assert(token.kind == OSQUAREBRACE);
assert(strcmp(token.lexeme, "[") == 0);
token = clex(lexer);
assert(token.kind == CSQUAREBRACE);
assert(strcmp(token.lexeme, "]") == 0);
token = clex(lexer);
assert(token.kind == CPARAN);
assert(strcmp(token.lexeme, ")") == 0);
token = clex(lexer);
assert(token.kind == OCURLYBRACE);
assert(strcmp(token.lexeme, "{") == 0);
token = clex(lexer);
assert(token.kind == RETURN);
assert(strcmp(token.lexeme, "return") == 0);
token = clex(lexer);
assert(token.kind == CONSTANT);
assert(strcmp(token.lexeme, "23") == 0);
token = clex(lexer);
assert(token.kind == SEMICOL);
assert(strcmp(token.lexeme, ";") == 0);
token = clex(lexer);
assert(token.kind == CCURLYBRACE);
assert(strcmp(token.lexeme, "}") == 0);
token = clex(lexer);
assert(token.kind == -1);
assert(token.lexeme == NULL);
}
NFA can be drawn with Graphviz.
#include "fa.h"
int main(int argc, char *argv) {
Node *nfa = clexNfaFromRe("[A-Z]a(bc|de)*f");
clexNfaDraw(nfa);
}
Above code will output this to stdout:
digraph G {
1 -> 0 [label="A-Z"];
0 -> 2 [label="a-a"];
2 -> 3 [label="e"];
3 -> 4 [label="e"];
4 -> 5 [label="b-b"];
5 -> 6 [label="c-c"];
6 -> 7 [label="e"];
7 -> 8 [label="e"];
8 -> 9 [label="f-f"];
7 -> 2 [label="e"];
2 -> 10 [label="e"];
10 -> 11 [label="d-d"];
11 -> 12 [label="e-e"];
12 -> 7 [label="e"];
3 -> 8 [label="e"];
}
The output can be processed with Graphviz to get the graph image:
dot -Tpng output.dot > output.png