summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Carl Worth <cworth@cworth.org> 2010-05-25 15:04:32 -0700
committer Carl Worth <cworth@cworth.org> 2010-05-25 15:04:32 -0700
commit 9fb8b7a495c9dc6f9a62cf82300fae5925af92fc (patch)
tree 622648ad7c965b94608a2f4d6d05dfb9963e348f
parent 808401fd79eea9fa2c965f9f235a753c0cb0d920 (diff)
Make the lexer pass whitespace through (as OTHER tokens) for text lines.
With this change, we can recreate the original text-line input exactly. Previously we were inserting a space between every pair of tokens so our output had a lot more whitespace than our input. With this change, we can drop the "-w" option to diff and match the input exactly.
-rw-r--r-- glcpp-lex.l 72
-rw-r--r-- glcpp-parse.y 2
-rwxr-xr-x tests/glcpp-test 2
3 files changed, 59 insertions, 17 deletions
diff --git a/glcpp-lex.l b/glcpp-lex.l
index f1dd11ea9b..7b5cdd57a0 100644
--- a/glcpp-lex.l
+++ b/glcpp-lex.l
@@ -32,6 +32,21 @@
%option reentrant noyywrap
%option extra-type="glcpp_parser_t *"
+ /* This lexer has two states:
+ *
+ * The CONTROL state is for control lines (directives).
+ * It lexes exactly as specified in the C99 specification.
+ *
+ * The INITIAL state is for input lines. In this state, we
+ * make the OTHER token much more broad in that it now
+ * includes tokens consisting entirely of whitespace. This
+ * allows us to pass text through verbatim. It avoids the
+ * "inadvertent token pasting" problem that would occur if we
+ * just printed tokens, while also avoiding excess whitespace
+ * insertion in the output.*/
+
+%x CONTROL
+
SPACE [[:space:]]
NONSPACE [^[:space:]]
NEWLINE [\n]
@@ -48,75 +63,104 @@ HEXADECIMAL_INTEGER 0[xX][0-9a-fA-F]+[uU]?
%%
{HASH}define{HSPACE}+/{IDENTIFIER}"(" {
+ BEGIN CONTROL;
return HASH_DEFINE_FUNC;
}
{HASH}define {
+ BEGIN CONTROL;
return HASH_DEFINE_OBJ;
}
{HASH}undef {
+ BEGIN CONTROL;
return HASH_UNDEF;
}
{HASH} {
+ BEGIN CONTROL;
return HASH;
}
-{IDENTIFIER} {
+<CONTROL>{IDENTIFIER} {
yylval.str = xtalloc_strdup (yyextra, yytext);
return IDENTIFIER;
}
-"<<" {
+<CONTROL>"<<" {
return LEFT_SHIFT;
}
-">>" {
+<CONTROL>">>" {
return RIGHT_SHIFT;
}
-"<=" {
+<CONTROL>"<=" {
return LESS_OR_EQUAL;
}
-">=" {
+<CONTROL>">=" {
return GREATER_OR_EQUAL;
}
-"==" {
+<CONTROL>"==" {
return EQUAL;
}
-"!=" {
+<CONTROL>"!=" {
return NOT_EQUAL;
}
-"&&" {
+<CONTROL>"&&" {
return AND;
}
-"||" {
+<CONTROL>"||" {
return OR;
}
-"##" {
+<CONTROL>"##" {
return PASTE;
}
-{PUNCTUATION} {
+<CONTROL>{PUNCTUATION} {
return yytext[0];
}
-\n {
+<CONTROL>{OTHER} {
+ yylval.str = xtalloc_strdup (yyextra, yytext);
+ return OTHER;
+}
+
+<CONTROL>{HSPACE}+
+
+<CONTROL>\n {
+ BEGIN INITIAL;
return NEWLINE;
}
-{OTHER} {
+{IDENTIFIER} {
+ yylval.str = xtalloc_strdup (yyextra, yytext);
+ return IDENTIFIER;
+}
+
+{OTHER}+ {
+ yylval.str = xtalloc_strdup (yyextra, yytext);
+ return OTHER;
+}
+
+{HSPACE}+ {
yylval.str = xtalloc_strdup (yyextra, yytext);
return OTHER;
}
-{HSPACE}+
+\n {
+ return NEWLINE;
+}
+
+. {
+ yylval.str = xtalloc_strdup (yyextra, yytext);
+ return OTHER;
+}
%%
diff --git a/glcpp-parse.y b/glcpp-parse.y
index 991b8a0b85..957421b864 100644
--- a/glcpp-parse.y
+++ b/glcpp-parse.y
@@ -517,8 +517,6 @@ _token_list_print (token_list_t *list)
for (node = list->head; node; node = node->next) {
_token_print (node->token);
- if (node->next)
- printf (" ");
}
}
diff --git a/tests/glcpp-test b/tests/glcpp-test
index 34cca88330..8074e47119 100755
--- a/tests/glcpp-test
+++ b/tests/glcpp-test
@@ -9,5 +9,5 @@ for test in *.c; do
gcc -E $test -o $test.gcc
# grep -v '^#' < $test.gcc > $test.expected
grep -v '^[ ]*#' < $test > $test.expected
- diff -w -u $test.expected $test.out
+ diff -u $test.expected $test.out
done