Το μεταεργαλείο lex

Διομήδης Σπινέλλης
Τμήμα Διοικητικής Επιστήμης και Τεχνολογίας
Οικονομικό Πανεπιστήμιο Αθηνών
dds@aueb.gr

Μεταεργαλεία

Το μεταεργαλείο lex

Διαδικασία χρήσης

Αρχείο εισόδου

Κανονικές εκφράσεις

Στο αρχείο εισόδου επιτρέπεται η παρακάτω σύνταξη για τις κανονικές εκφράσεις:
`x'
match the character `x'
`.'
any character (byte) except newline
`[xyz]'
a "character class"; in this case, the pattern matches either an `x', a `y', or a `z'
`[abj-oZ]'
a "character class" with a range in it; matches an `a', a `b', any letter from `j' through `o', or a `Z'
`[^A-Z]'
a "negated character class", i.e., any character but those in the class. In this case, any character EXCEPT an uppercase letter.
`[^A-Z\n]'
any character EXCEPT an uppercase letter or a newline
`r*'
zero or more r's, where r is any regular expression
`r+'
one or more r's
`r?'
zero or one r's (that is, "an optional r")
`r{2,5}'
anywhere from two to five r's
`r{2,}'
two or more r's
`r{4}'
exactly 4 r's
`{name}'
the expansion of the "name" definition (see above)
`"[xyz]\"foo"'
the literal string: `[xyz]"foo'
`\x'
if x is an `a', `b', `f', `n', `r', `t', or `v', then the ANSI-C interpretation of \x. Otherwise, a literal `x' (used to escape operators such as `*')
`\0'
a NUL character (ASCII code 0)
`\123'
the character with octal value 123
`\x2a'
the character with hexadecimal value 2a
`(r)'
match an r; parentheses are used to override precedence (see below)
`rs'
the regular expression r followed by the regular expression s; called "concatenation"
`r|s'
either an r or an s
`r/s'
an r but only if it is followed by an s. The text matched by s is included when determining whether this rule is the longest match, but is then returned to the input before the action is executed. So the action only sees the text matched by r. This type of pattern is called trailing context. (There are some combinations of `r/s' that flex cannot match correctly; see notes in the Deficiencies / Bugs section below regarding "dangerous trailing context".)
`^r'
an r, but only at the beginning of a line (i.e., which just starting to scan, or right after a newline has been scanned).
`r$'
an r, but only at the end of a line (i.e., just before a newline). Equivalent to "r/\n". Note that flex's notion of "newline" is exactly whatever the C compiler used to compile flex interprets '\n' as; in particular, on some DOS systems you must either filter out \r's in the input yourself, or explicitly use r/\r\n for "r$".
`<s>r'
an r, but only in start condition s (see below for discussion of start conditions) <s1,s2,s3>r same, but in any of start conditions s1, s2, or s3
`<*>r'
an r in any start condition, even an exclusive one.
`<<EOF>>'
an end-of-file <s1,s2><<EOF>> an end-of-file when in start condition s1 or s2

Τιμές προσβάσιμες στο χρήστη

Απλό παράδειγμα

       int num_lines = 0, num_chars = 0;

%%
\n      ++num_lines; ++num_chars;
.       ++num_chars;

%%
main()
        {
        yylex();
        printf( "# of lines = %d, # of chars = %d\n",
                num_lines, num_chars );
        }

Παράδειγμα χρήσης

athena:~/t> cat foo.l
%option noyywrap

        int num_lines = 0, num_chars = 0;

%%
\n      ++num_lines; ++num_chars;
.       ++num_chars;

%%
main()
        {
        yylex();
        printf( "# of lines = %d, # of chars = %d\n",
                num_lines, num_chars );
        }
athena:~/t> lex foo.l
athena:~/t> cc -o foo lex.yy.c
athena:~/t> foo <foo.l
# of lines = 15, # of chars = 261
athena:~/t>

Σύνθετο παράδειγμα

/* scanner for a toy Pascal-like language */

%{
/* need this for the call to atof() below */
#include <math.h>
%}

DIGIT    [0-9]
ID       [a-z][a-z0-9]*

%%

{DIGIT}+    {
            printf( "An integer: %s (%d)\n", yytext,
                    atoi( yytext ) );
            }

{DIGIT}+"."{DIGIT}*        {
            printf( "A float: %s (%g)\n", yytext,
                    atof( yytext ) );
            }

if|then|begin|end|procedure|function        {
            printf( "A keyword: %s\n", yytext );
            }

{ID}        printf( "An identifier: %s\n", yytext );

"+"|"-"|"*"|"/"   printf( "An operator: %s\n", yytext );

"{"[^}\n]*"}"     /* eat up one-line comments */

[ \t\n]+          /* eat up whitespace */

.           printf( "Unrecognized character: %s\n", yytext );

%%

main()
{
    yylex();
}

Τρόπος λειτουργίας

Παράδειγμα

Αρχείο εισόδου

%option noyywrap

%%
hi      printf("HI\n");
\.      printf("DOT\n");

%%
main()
        {
        yylex();
        }

Μη αιτιοκρατικό πεπερασμένο αυτόματο

Το αυτόματο αρχίζει από την κατάσταση 12. Από κάθε κατάσταση μπορεί να οδηγηθεί σε δύο νέες καταστάσεις Κ1, Κ2. (ε είναι το κενό σύμβολο).
                 συμβ.   Κ1    Κ2
Κατάσταση    1	 ε :     0,    0
Κατάσταση    2	 ε :     0,    0
Κατάσταση    3	 h :     4,    0	// Διάβασε h
Κατάσταση    4	 i :     5,    0	// Διάβασε i
Κατάσταση    5	 ε :     0,    0  [1]	// Αναγνώρισε hi
Κατάσταση    6	 ε :     1,    3
Κατάσταση    7	 . :     8,    0	// Διάβασε .
Κατάσταση    8	 ε :     0,    0  [2]	// Αναγνώρισε .
Κατάσταση    9	 ε :     6,    7
Κατάσταση   10	 EOF:    11,   0
Κατάσταση   11	 ε :     0,    0  [3]	// Τέλος του αρχείου
Κατάσταση   12	 ε :     9,   10

Κλάσεις ισοδυναμίας

\000 = -1  ' ' = 1     @ = 1     ` = 1 
\001 = 1     ! = 1     A = 1     a = 1
\002 = 1     " = 1     B = 1     b = 1
\003 = 1     # = 1     C = 1     c = 1
\004 = 1     $ = 1     D = 1     d = 1
\005 = 1     % = 1     E = 1     e = 1
\006 = 1     & = 1     F = 1     f = 1
  \a = 1     ' = 1     G = 1     g = 1
  \b = 1     ( = 1     H = 1     h = 3
  \t = 1     ) = 1     I = 1     i = 4
  \n = 1     * = 1     J = 1     j = 1
  \v = 1     + = 1     K = 1     k = 1
  \f = 1     , = 1     L = 1     l = 1
  \r = 1     - = 1     M = 1     m = 1
\016 = 1     . = 2     N = 1     n = 1
\017 = 1     / = 1     O = 1     o = 1
\020 = 1     0 = 1     P = 1     p = 1
\021 = 1     1 = 1     Q = 1     q = 1
\022 = 1     2 = 1     R = 1     r = 1
\023 = 1     3 = 1     S = 1     s = 1
\024 = 1     4 = 1     T = 1     t = 1
\025 = 1     5 = 1     U = 1     u = 1
\026 = 1     6 = 1     V = 1     v = 1
\027 = 1     7 = 1     W = 1     w = 1
\030 = 1     8 = 1     X = 1     x = 1
\031 = 1     9 = 1     Y = 1     y = 1
\032 = 1     : = 1     Z = 1     z = 1
\033 = 1     ; = 1     [ = 1     { = 1
\034 = 1     < = 1     \ = 1     | = 1
\035 = 1     = = 1     ] = 1     } = 1
\036 = 1     > = 1     ^ = 1     ~ = 1
\037 = 1     ? = 1     _ = 1  \177 = 1 

Μετακλάσεις ισοδυναμίας

1 = 1
2 = 1
3 = 1
4 = 2

Αιτιοκρατικό πεπερασμένο αυτόματο

Κατάσταση # 1:
	1	4
	2	5	// .
	3	6	// h
	4	4
Κατάσταση # 2:
	1	4
	2	5	// .
	3	6	// h
	4	4
Κατάσταση # 3:
Κατάσταση # 4:
Κατάσταση # 5:		// Αναγνώρισε .
Κατάσταση # 6:
	4	7	// i
Κατάσταση # 7:		// Αναγνώρισε hi

Παραγόμενος κώδικας σε C (τμήματα)


/* A lexical scanner generated by flex */

/* ... */

/* Πίνακες μετάβασης */

static yyconst short int yy_def[10] =
    {   0,
        8,    1,    8,    8,    8,    9,    8,    0,    8
    } ;

static yyconst short int yy_nxt[12] =
    {   0,
        4,    5,    6,    4,    7,    8,    3,    8,    8,    8,
        8
    } ;

static yyconst short int yy_chk[12] =
    {   0,
        1,    1,    1,    1,    9,    3,    8,    8,    8,    8,
        8
    } ;

/* ... */

/* Υλοποίηση του ΝΠΑ */

yy_match:
                do
                        {
                        register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)];
                        while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
                                {
                                yy_current_state = (int) yy_def[yy_current_state];
                                if ( yy_current_state >= 9 )
                                        yy_c = yy_meta[(unsigned int) yy_c];
                                }
                        yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
                        *yy_state_ptr++ = yy_current_state;
                        ++yy_cp;
                        }

/* ... */

/* Υλοποίηση ενεργειών χρήστη */
                switch ( yy_act )
                        { /* beginning of action switch */
                case 1:
                        printf("HI\n");
                        break;
                case 2:
                        printf("DOT\n");
                        break;

/* ... */

/* Κώδικας χρήστη */

main()
        {
        yylex();
        }

Ασκήσεις

  1. Να υλοποιήσετε με το μεταεργαλείο lex έναν λεκτικό αναλυτή που να αναγνωρίζει τα παρακάτω λεκτικά σύμβολα: Ο λεκτικός αναλυτής θα τυπώνει στην έξοδό του τον τύπο του κάθε συμβόλου που θα αναγνωρίζει καθώς και την τιμή του. Παράδειγμα:
    Read integer (42)
    Read variable (i)
    Read keyword (while)
    

Βιβλιογραφία