词法分析器【编译原理】

实验内容：

基于TEST语言设计相应的词法分析器，并且输出二元组

实验目的：

1、理解词法分析器的基本功能

2、理解简单的词法规则的描述方法

3、理解状态转换图及其实现

4、能够编写简单的词法分析器

实验原理：

根据DFA构造词法分析程序

1、直接编程的词法分析程序

（1）、适合词法比较简单的、手工实现、比较精简，分析速度快

（2）、与要识别的语言单词密切相关，一旦词法规则发生变化，则要重新编写程序

（3）、通过程序的控制流转移来完成对输入字符的响应，程序中的每一条语句都要与识别的单词符号有关

2、表驱动的词法分析程序

（1）、一种典型的数据与操作的分离的工作模式，控制程序不变；不同的词法分析器实质上是构造不同的分析表

（2）、为词法分析程序的自动生成提供了极大的方便

（3）、程序比较复杂，分析速度慢一些

实验内容：

1、输入：源文件字符序列s

任务：识别单词符号；滤过空格、注释等

依据：TEST语言的词法规则

输出：字符流（单词）、错误信息

2、本实验我设计的词法分析器：

（1）、能够识别出保留字、标识符、单分界符、双分界符、常量

（2）、按照状态转换图（直接编码的状态机）识别注释并且滤过注释

（3）、错误处理，能够连续查错并且能够指明错误类型

主要实现了三种错误的查找：

非法字符（@、￥等）

大小写敏感问题（保留字大小写错误：例如一行开头的 INT a 会报错；但对于 int INT 不报错，其中的 INT 只会被识别为标识符）

标识符以数字开头的错误

实验代码：

#define _CRT_SECURE_NO_WARNINGS
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#define NUM 15				// number of reserved words in keyWord

FILE* f_in;					// source file being scanned (opened in main)
FILE* f_out;				// file that receives the scan results (opened in main)
char f1_name[100];			// name of the input file, read from the user
char f2_name[100];			// name of the output file, read from the user

// Reserved words of the language. main() sorts this table in place so that
// binary_S() can search it.
char keyWord[NUM][20] = { "default","if", "else","for","while","do","int","read","write","float","switch","case","break","function","call" };

char token[15];				// text of the word currently being recognised
char token_num;				// number of characters stored in token
char ch;					// lookahead: most recently read, not yet consumed character
int row;					// number of newlines seen so far (messages print row + 1)
char tmp[15];				// lower-cased copy of token, used for the case check
int flag1;					// set when a newline was skipped right before the current word
int flag2;					// set when the current word contains an upper-case letter
int flag3;					// set when letters were found inside a numeric literal
int flag = 0;				// not referenced by the code visible in this file section
char(*keyword)[20] = keyWord;	// alias of keyWord; not referenced by the visible code
char* string;				// not referenced by the visible code

void compile();
int compile_word();
void sort(char(*a)[20]);
int binary_S(char(*a)[20], char* string);

// Program entry point.
//
// Sorts the reserved-word table, asks for the input and output file names,
// opens both files, runs the scanner and closes the files again.
// Returns EXIT_SUCCESS when the scan ran and the result file was written,
// EXIT_FAILURE when a file name could not be read or a file could not be
// opened or flushed.
int main() {
	int status = EXIT_FAILURE;

	sort(keyWord);

	printf("请输入要编译的文件名字:");
	// %99s leaves room for the terminating NUL in the 100-byte name buffers.
	if (scanf("%99s", f1_name) != 1) {
		fprintf(stderr, "error: no input file name was given\n");
		goto done;
	}
	f_in = fopen(f1_name, "r");
	if (f_in == NULL) {
		perror(f1_name);
		goto done;
	}

	printf("请输入要将编译结果存入的文件名字:");
	if (scanf("%99s", f2_name) != 1) {
		fprintf(stderr, "error: no output file name was given\n");
		goto close_input;
	}
	f_out = fopen(f2_name, "w");
	if (f_out == NULL) {
		perror(f2_name);
		goto close_input;
	}

	compile();
	status = EXIT_SUCCESS;

	// fclose flushes buffered output, so a failure here means lost results.
	if (fclose(f_out) != 0) {
		perror(f2_name);
		status = EXIT_FAILURE;
	}

close_input:
	fclose(f_in);

done:
	// Keep the console window open (Windows "pause" command), also after errors.
	system("pause");
	return status;
}

// Sort the reserved-word array in ascending strcmp (ASCII) order.
// Sorts the NUM rows of `a` in place into ascending strcmp order.
//
// a : table of NUM NUL-terminated strings, 20 bytes per row.
// Simple exchange sort: every later row that compares smaller than row
// `left` is swapped into position `left`.
void sort(char(*a)[20]) {
	char held[20];

	for (int left = 0; left < NUM; left++) {
		for (int right = left + 1; right < NUM; right++) {
			if (strcmp(a[right], a[left]) >= 0) {
				continue;
			}
			// Swap the two complete 20-byte rows.
			memcpy(held, a[left], sizeof held);
			memcpy(a[left], a[right], sizeof held);
			memcpy(a[right], held, sizeof held);
		}
	}
}
//对数组进行折半查找
int binary_S(char(*a)[20], char* string) {int low = 0;int high = NUM-1;while (low <= high) {int middle = (low + high) / 2;if (strcmp(string, a[middle]) == 0) {return middle;}else if (strcmp(string, a[middle]) < 0) {high = middle - 1;}else {low = middle + 1;}}return -1;
}void bqd() {//状态2switch (ch){case '*': ch = getc(f_in);					//转到状态3//状态3s3:while (ch != '*') {ch = getc(f_in);if (ch == EOF) {printf("ERROR: the error place is in the %d row.注释错误\n", row + 1);return;}}		                                    //状态3循环switch (ch){case '*':ch = getc(f_in);				//转到状态4//状态4while (ch == '*') ch = getc(f_in);  //状态4循环switch (ch){case '/':ch = getc(f_in);		printf("注释正确\n");fprintf(f_out, "注释正确\n");return;							//状态5结束default:goto s3;					//转到状态3}default: goto end;}default:printf("单分符\t%s\n", token);			//状态6goto end;}end:return;
}
int compile_word() {//将识别的单词数组初始化for (int i = 0;i < 15;i++) {token[i] = NULL;tmp[i] = NULL;}token_num = 0;flag1 = 0;flag2 = 0;flag3 = 0;//处理空格while ((ch == ' ') || (ch == '\n')) {if (ch == '\n') {row++;flag1 = 1;}				ch = getc(f_in);}if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {				//输入可能是标识符或者保留字//组成一个单词while ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9')) {token[token_num++] = ch;ch = getc(f_in);}if (flag1 == 1) {for (int j = 0;j < token_num;j++) {tmp[j] = token[j];if (token[j] >= 'A' && token[j] <= 'Z') {tmp[j] = tmp[j] + 32;								    //大写转换为小写flag2 = 1;}}}token[token_num++] = '\0';//比对保留字for (int i = 0;i < NUM;i++) {if (flag1 == 1 && flag2 == 1) {if (binary_S(keyWord, tmp)!=-1) {return -3;												//大小写敏感}}if (binary_S(keyWord, token)!=-1) {							        //匹配到某个保留字return 1;}}return 2;															//关键字ID}else if (ch >= '0' && ch <= '9') {										//输入的是常量NUM(整型)//组成一个单词while ((ch >= '0' && ch <= '9') || ch == '.') {                                 //扩展为浮点型token[token_num++] = ch;ch = getc(f_in);while((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {ch = getc(f_in);flag3 = 1;}}if (flag3 != 1) {return 3;}                                                                   //常量else return -4;														//非法单词}else {																	//输入为纯单分符token[token_num++] = ch;switch (ch){//单分符为4  双分界符为5case '*':ch = getc(f_in);return 4;case '+':ch = getc(f_in);if (ch == '+') {token[token_num++] = ch;ch = getc(f_in);return 5;}else {return 4;}				//可以扩展为++case '-':ch = getc(f_in);return 4;					//可以扩展为--case '(':				ch = getc(f_in);return 4;case ')':ch = getc(f_in);return 4;case '{':ch = getc(f_in);return 4;case '}':ch = getc(f_in);return 4;case ',':ch = getc(f_in);return 4;case ';':ch = getc(f_in);return 4;case '"':ch = getc(f_in);return 4;case '/':ch = getc(f_in);return 4;case '>':ch = getc(f_in);//读下个字符看看是不是双分符if (ch == '=') {token[token_num++] = ch;ch = getc(f_in);return 5;}else {return 4;}case '<':ch = 
getc(f_in);//读下个字符看看是不是双分符if (ch == '=') {token[token_num++] = ch;ch = getc(f_in);return 5;}else {return 4;}case '!':ch = getc(f_in);//读下个字符看看是不是双分符if (ch == '=') {token[token_num++] = ch;ch = getc(f_in);return 5;}else {return 4;}case ':':ch = getc(f_in);return 4;case '=':ch = getc(f_in);//读下个字符看看是不是双分符if (ch == '=') {token[token_num++] = ch;ch = getc(f_in);return 5;}else {return 4;}case EOF:return -1;    //文件结尾符号default:			//错误没有匹配ch = getc(f_in);return -2;}}
} 
int INT;void compile() {int state;			//记录编译状态int error[100];		//记录错误行数printf("编译结果:\n");printf("类别值\t自身值\n");//读取文件第一个字符ch = getc(f_in);while (1) {if (ch != '/') {state = compile_word();if (state == -1) {break;}switch (state){case 1:printf("%s\t%s\n", token, token);fprintf(f_out, "%s\t%s\n", token, token);break;case 2: {printf("ID\t%s\n", token);fprintf(f_out, "ID\t%s\n", token);}break;case 3:printf("NUM\t%s\n", token);fprintf(f_out,"NUM\t%s\n", token);break;case 4:printf("%s\t%s\n", token, token);fprintf(f_out, "%s\t%s\n", token, token);break;case 5:printf("%s\t%s\n", token, token);fprintf(f_out, "%s\t%s\n", token, token);break;case -2:printf("ERROR: the error place is in the %d row. You have entered illegal characters\n", row + 1);fprintf(f_out, "ERROR: the error place is in the %d row. You have entered illegal characters\n", row + 1);break;case -3:printf("ERROR: the error place is in the %d row. You should enter lowercase (%s)\n", row + 1, tmp);fprintf(f_out, "ERROR: the error place is in the %d row. You should enter lowercase (%s)\n", row + 1, tmp);break;case -4:printf("ERROR: the error place is in the %d row. You cannot start a word with a number\n", row + 1);fprintf(f_out, "ERROR: the error place is in the %d row. You cannot start a word with a number\n", row + 1);break;default:break;}}else{ch = getc(f_in);bqd();}}
}

测试数据：