-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexer.c
369 lines (318 loc) · 10.5 KB
/
lexer.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
// Homeworktwo: Lexer
// Ashton Ansag
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Token codes produced by the lexer. Values are consecutive, starting at 1;
// they are written out explicitly so the numbers that appear in the output
// files can be looked up directly.
typedef enum{
    // literals and identifiers
    nulsym       = 1,
    identsym     = 2,
    numbersym    = 3,
    // arithmetic operators
    plussym      = 4,
    minussym     = 5,
    multsym      = 6,
    slashsym     = 7,
    oddsym       = 8,
    // relational operators
    eqlsym       = 9,
    neqsym       = 10,
    lessym       = 11,
    leqsym       = 12,
    gtrsym       = 13,
    geqsym       = 14,
    // punctuation
    lparentsym   = 15,
    rparentsym   = 16,
    commasym     = 17,
    semicolonsym = 18,
    periodsym    = 19,
    becomessym   = 20,
    // reserved words
    beginsym     = 21,
    endsym       = 22,
    ifsym        = 23,
    thensym      = 24,
    whilesym     = 25,
    dosym        = 26,
    callsym      = 27,
    constsym     = 28,
    varsym       = 29,
    procsym      = 30,
    writesym     = 31,
    readsym      = 32,
    elsesym      = 33,
    returnsym    = 34
}token;
/* Characters are classified by comparing their codes against the ranges
   below rather than by searching arrays of digits/letters/symbols.
   The limits are written as character constants; the ASCII code of each
   one is given in the trailing comment. */
// Digits
#define DIGIT_MIN '0'            /* 48 */
#define DIGIT_MAX '9'            /* 57 */
// Uppercase letters
#define ULETTER_MIN 'A'          /* 65 */
#define ULETTER_MAX 'Z'          /* 90 */
// Lowercase letters
#define LLETTER_MIN 'a'          /* 97 */
#define LLETTER_MAX 'z'          /* 122 */
// Invisible (whitespace) characters
#define TAB '\t'                 /* 9 */
#define WHITE_SPACE ' '          /* 32 */
#define NEWLINE '\n'             /* 10 */
#define CARRIAGE_RETURN '\r'     /* 13 */
// Special symbols: the contiguous range '(' .. '>'.
// NOTE: this range also spans the digit codes 48..57, so a range test
// against SYM_MIN/SYM_MAX alone accepts digits as well.
#define SYM_MIN '('              /* 40 */
#define SYM_MAX '>'              /* 62 */
// Global loop counter. Code that uses it assigns it a starting value
// before reading it, so its value between uses carries no meaning.
int temp;
// Global scratch buffer for the lexeme currently being read.
// It is allocated inside execute().
char *buffer;
// Index into buffer used while a lexeme is being built (starts at 0)
int bufferLen = 0;
// The 15 reserved words of PL/0, each stored in a 12-byte slot
// (the longest, "procedure", needs 10 bytes including the terminator).
char resword[15][12] = {
"const","var","procedure","call","begin","end","if","then",
"else","while","do","read","write","odd","return"
};
// Token values for the reserved words, in the same order as resword:
// restoken[k] is the token for resword[k].
// NOTE: the array is declared with 16 slots but has only 15 initializers,
// so the last element is zero-initialized and matches no reserved word.
int restoken[16] = {
constsym, varsym, procsym, callsym, beginsym, endsym, ifsym, thensym,
elsesym, whilesym, dosym, readsym, writesym, oddsym, returnsym
};
// One entry of the lexeme table: the text of a lexeme and its token value
typedef struct{
char *lexeme; // heap-allocated copy of the lexeme text
int token;    // token value (see the token enum)
}lex;
// Dynamically sized array of lexeme entries; grown with realloc as
// entries are added (starts out NULL as a zero-initialized global)
lex *lexTable;
// Number of entries currently stored in lexTable
int ts = 0;
// Messages for the lexical errors this lexer detects, selected by index:
//   0 - a number is immediately followed by a letter
//   1 - a number has too many digits
//   2 - an identifier has too many characters
//   3 - a character or symbol sequence is not part of the language
// Each slot is 22 bytes, which is exactly enough for the longest message
// plus its terminating NUL.
char lexError[4][22] = {
"Var started with num\n", "Number too long\n", "Name too long\n", "Invalid symbol\n"
};
// Reports whether a character is a decimal digit.
// Returns 1 when the character code lies in [DIGIT_MIN, DIGIT_MAX], else 0.
int isDigit( char test ){
    int code = (int)test;
    // reject anything outside the digit range, one bound at a time
    if( code < DIGIT_MIN ) return 0;
    if( code > DIGIT_MAX ) return 0;
    return 1;
}
// Reports whether a character is an upper- or lowercase letter.
// Returns 1 when the character code lies in one of the two letter ranges,
// else 0.
int isLetter( char test ){
    int code = (int)test;
    int isUpper = ( code >= ULETTER_MIN && code <= ULETTER_MAX );
    int isLower = ( code >= LLETTER_MIN && code <= LLETTER_MAX );
    return ( isUpper || isLower ) ? 1 : 0;
}
// Reports whether a character code lies in the special-symbol range
// [SYM_MIN, SYM_MAX]. Returns 1 when it does, else 0.
int isSymbol( char test ){
    int code = (int)test;
    return ( code >= SYM_MIN && code <= SYM_MAX ) ? 1 : 0;
}
// Reports whether a character is one of the invisible characters:
// tab, space, newline or carriage return. Returns 1 if so, else 0.
int isInvis( char test ){
    switch( (int)test ){
        case TAB:
        case WHITE_SPACE:
        case NEWLINE:
        case CARRIAGE_RETURN:
            return 1;
        default:
            return 0;
    }
}
// This method will add items to the lexemetable array and return if added
int addToTable( FILE *clean, char *str, int token ){
// reallocate space for a new item in the lex array
lexTable = realloc( lexTable, ++ts * sizeof(lex) );
// malloc and store the lexeme string
lexTable[ts -1].lexeme = (char*)malloc( 12 );
strcpy( lexTable[ts -1].lexeme, str );
// make sure variable is treated as a variable
if( token == returnsym ) token = identsym;
// store the token
lexTable[ts -1].token = token;
// add string to cleaninput.txt
fprintf( clean, "%s", str );
return 1;
}
// Looks str up in the reserved-word list.
// On a match the reserved word and its token are added to the lexeme table
// (and echoed to clean) through addToTable, whose result is returned.
// Returns 0 when str is not a reserved word.
// The global counter temp is used as the search index.
int isReserved( FILE *clean, char *str ){
    temp = 0;
    while( temp < 15 ){
        if( strcmp( resword[temp], str ) == 0 )
            return addToTable( clean, resword[temp], restoken[temp] );
        temp++;
    }
    return 0;
}
// Reports a lexical error and copies the rest of the input to the clean
// file with comments removed.
//   input  - stream being lexed, positioned just after the offending text
//   clean  - stream receiving the comment-free copy of the input
//   c      - first character to process before reading on from input
//   buffer - partial lexeme collected so far; written to clean first
//   i      - index into lexError selecting the message (out-of-range
//            values print no message)
// Always returns 0. Neither stream is closed here.
// NOTE(review): buffer is written and then c is written as well; the call
// sites in this file mostly pass a buffer that already ends with c, so
// that character may appear twice in the clean file - confirm intended.
int error( FILE *input, FILE *clean, char c, char *buffer, int i ){
    if( i >= 0 && i < (int)( sizeof lexError / sizeof lexError[0] ) )
        printf( "%s", lexError[i] );
    fprintf( clean, "%s", buffer );

    // getc returns an int; keeping it in an int lets EOF be told apart
    // from every real character regardless of the signedness of char
    int ch = c;
    while( ch != EOF ){
        if( ch != '/' ){
            // not a comment so just print away
            fputc( ch, clean );
        }else{
            // still clean input of comments
            ch = getc( input );
            if( ch == '*' ){
                // '/*' multiline comment: skip up to and including the
                // first '*' that is directly followed by '/'. Tracking the
                // previous character handles endings such as "**/".
                int prev = 0;
                while( ( ch = getc( input ) ) != EOF ){
                    if( prev == '*' && ch == '/' ) break;
                    prev = ch;
                }
            }else if( ch == '/' ){
                // '//' singleline comment: skip through the newline
                do{
                    ch = getc( input );
                }while( ch != NEWLINE && ch != EOF );
            }else{
                // '/' as in 'divides': add this one to the table and
                // re-read the character that followed it
                addToTable( clean, "/", slashsym );
                ungetc( ch, input );
            }
        }
        ch = getc( input );
    }
    return 0;
}
// This method will read the input file char by char
// and will clean the input and build the lexTable
int execute( ){
// open input.txt and cleaninput.txt
FILE *input = fopen( "input.txt","r" );
FILE *clean = fopen( "cleaninput.txt","w+" );
// begin grabbing char by char
char c = getc( input );
buffer = (char *)malloc(13);
// while there are characters left parse the file
while( c != EOF ){
// makes sure the buffer is clean
for( temp = 0; temp < 13; temp++ )
buffer[temp] = '\0';
// start building a word
buffer[bufferLen] = c;
if( isLetter(c) ){ // word starts with a letter
// keep adding letters or digits to the word
while( isLetter(c) || isDigit(c) ){
buffer[bufferLen++] = c;
// if the buffer word is too large give error
if( bufferLen > 11 ) return error( input, clean, c, buffer, 2 );
c = getc( input );
}
// now check if the found word is a reserved word
if( !isReserved( clean, buffer ) )
// add to lexTable as an identsym
addToTable( clean, buffer, identsym );
// unget one char
ungetc( c, input );
}else if( isDigit(c) ){ // word start with a digit
// keep adding digits to the word (number)
while( isDigit(c) ){
buffer[bufferLen++] = c;
// if the buffer word (number) is no large give error
if( bufferLen > 5 ) return error( input, clean, c, buffer, 1 );
c = getc( input );
// if the word starts with a digit and has a letter give error
if( isLetter(c) ) return error( input, clean, c, buffer, 0 );
}
// add to lexTable as an numbersym
addToTable( clean, buffer, numbersym );
// unget one char and add buffer to cleaninput.txt
ungetc( c, input );
}else if( isSymbol(c) ){ // word is a symbol
// use a switch to add the corresponding symbol to the lexTable
// also add the symbol to cleaninput.txt
switch(c){
case '+':
addToTable( clean, "+", plussym );
break;
case '-':
addToTable( clean, "-", minussym );
break;
case '*':
addToTable( clean, "*", multsym );
break;
case '/': // two comment formatts that are left out of cleaninput.txt
c = getc( input );
switch(c){
case '*': // '/*' multiline comment
do{ // waits for another '/' after a '*'
do{ // waits for another '*'
c = getc( input );
}while( c != '*' );
c = getc( input );
}while( c != '/' );
break;
case '/': // '//' singleline comment
do{ // waits for a newline character
c = getc( input );
}while( c != NEWLINE );
break;
default: // '/' as in 'divides' add this one to the table
addToTable( clean, "/", slashsym );
ungetc( c, input );
}
break;
case '(':
addToTable( clean, "(", lparentsym );
break;
case ')':
addToTable( clean, ")", rparentsym );
break;
case '=':
addToTable( clean, "=", eqlsym );
break;
case ',':
addToTable( clean, ",", commasym );
break;
case '.':
addToTable( clean, ".", periodsym );
break;
case '<':
c = buffer[++bufferLen] = getc( input );
switch( c ){
case '=': // less than or equal to
addToTable( clean, "<=", leqsym );
break;
case '>': // '<>' neqsym
addToTable( clean, "<>", neqsym );
break;
default: // just less than
addToTable( clean, "<", lessym );
}
// ungetc( c, input );
break;
case '>':
c = buffer[++bufferLen] = getc( input );
switch( c ){
case '=': // greater than or equal to
addToTable( clean, ">=", leqsym );
break;
default: // just greater than
addToTable( clean, ">", lessym );
}
ungetc( c, input );
break;
case ';':
addToTable( clean, ";", semicolonsym );
break;
case ':': // a colon is found check for an '='
// if not bleed into the default case to send an error
buffer[++bufferLen] = getc( input );
if( !strncmp( buffer, ":=", 2 ) ){
addToTable( clean, ":=", becomessym );
break;
}
default:
return error( input, clean, c, buffer, 3 );
}
}else if( isInvis(c) ){ // word is a white space
// add the char to the buffer
while( isInvis(c) ){
buffer[0] = c;
c = getc( input );
}
ungetc( c, input );
fprintf( clean, "%s", buffer );
}else
return error( input, clean, c, buffer, 3 );
// reset bufferLen and continue building char by char
bufferLen = 0;
c = getc( input );
}
fclose( input );
fclose( clean );
return 1;
}
// Writes the contents of lexTable to two files:
//   lexemetable.txt - a header line, then one "lexeme<TAB>token" line per
//                     entry
//   lexemelist.txt  - the token values separated by spaces; identifier and
//                     number tokens are followed by their lexeme text
// If either file cannot be opened the failure is reported on stderr and
// nothing further is written.
void lexOutput(){
    FILE *tableP = fopen( "lexemetable.txt","w+" );
    if( tableP == NULL ){
        perror( "lexemetable.txt" );
        return;
    }
    FILE *listP = fopen( "lexemelist.txt","w+" );
    if( listP == NULL ){
        perror( "lexemelist.txt" );
        fclose( tableP );
        return;
    }

    fprintf( tableP, "lexeme \ttoken type\n" );
    // go through the whole table adding all items
    for( int idx = 0; idx < ts; idx++ ){
        int token = lexTable[idx].token;
        fprintf( tableP, "%s\t%i\n", lexTable[idx].lexeme, token );
        fprintf( listP, "%i ", token );
        // identifiers and numbers also carry their text in the list file
        if( token == identsym || token == numbersym )
            fprintf( listP, "%s ", lexTable[idx].lexeme );
    }

    fclose( tableP );
    fclose( listP );
}
// Program entry point: runs the lexer over input.txt, then writes the
// lexeme table and lexeme list files. The lexer's result is not inspected,
// so the output files are produced even after a lexical error.
int main( void ){
    (void)execute();
    lexOutput();
    return 0;
}