美文网首页
词法语法解析

词法语法解析

作者: 消想 | 来源:发表于2021-07-22 14:39 被阅读0次
    • 熟练掌握词法、语法的解析流程及原理

    openGauss在执行SQL语句时,使用flex和bison分别对语句进行词法分析和语法分析
    词法语法分析的入口函数是raw_parser(parser.cpp),raw_parser调用base_yyparse进行词法语法分析
    -> scan.l: 词法文件,由flex编译生成scan.cpp
    -> gram.y: 语法文件,由bison编译生成gram.cpp
    -> kwlist.h: 列出所有关键字
    -> keywords.cpp: 关键字表定义(由kwlist.h中的关键字列表生成关键字数组)
    -> kwlookup.cpp: 使用二分查找确认当前词是否为关键字
    -> scansup.cpp:词法分析相关函数

    • scan.l
      scan.l识别SQL语句中的关键字,标识符,常量,操作符,终结符等。
    // flex文件由%%分隔为三个部分
    /* 定义段 */
    %{
    ...
    %}
    ...
    /* 规则段 */
    %%
    ...
    %%
    /* 用户子程序段 */
    

    相关数据结构

    // One entry of the keyword table searched by ScanKeywordLookup()
    typedef struct ScanKeyword {
        const char* name; /* keyword name, in lower case */
        int16 value;      /* token value returned to the parser for this keyword */
        int16 category;   /* keyword category */
    } ScanKeyword;
    

    定义段

    %{
    ... // 定义宏,函数及include的文件
    %}
    
    %option reentrant  // generate a reentrant scanner API
    %option bison-bridge // make the generated scanner API callable from bison
    %option bison-locations
    %option 8bit // 8-bit scanner
    %option never-interactive // non-interactive
    %option nodefault
    %option noinput
    %option nounput
    %option noyywrap // do not call yywrap()
    %option noyyalloc
    %option noyyrealloc
    %option noyyfree
    %option warn
    %option prefix="core_yy" // function names starting with yy are renamed to start with core_yy
    
    // Define start conditions; rules tagged with a condition are matched only while it is active
    %x xb // bit string
    %x xc // extended C-style comment
    %x xd // double-quoted identifier
    %x xh // hexadecimal numeric string
    %x xe // extended quoted string (supports backslash escape sequences)
    %x xq // standard quoted string
    %x xdolq // $xxx$ dollar-quoted string
    %x xui // identifier with Unicode escapes
    %x xus // string with Unicode escapes
    %x xeu // Unicode surrogate pair inside an extended quoted string
    
    // Named regular expressions used by the rules
    // Whitespace, newlines and "--" comments
    space           [ \t\n\r\f]
    horiz_space     [ \t\f]
    newline         [\n\r]
    non_newline     [^\n\r]
    
    comment         ("--"{non_newline}*)
    
    whitespace      ({space}+|{comment})
    
    special_whitespace      ({space}+|{comment}{newline})
    horiz_whitespace        ({horiz_space}|{comment})
    whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
    
    // Single quote and the ways a quoted literal can end or continue
    quote           '
    quotestop       {quote}{whitespace}*
    quotecontinue   {quote}{whitespace_with_newline}{quote}
    quotefail       {quote}{whitespace}*"-"
    
    // Bit string: b'...'
    xbstart         [bB]{quote}
    xbinside        [^']*
    
    // Hexadecimal string: x'...'
    xhstart         [xX]{quote}
    xhinside        [^']*
    
    // National character string: n'...'
    xnstart         [nN]{quote}
    
    /* Quoted string that allows backslash escapes */
    xestart         [eE]{quote}  // the e'...' form
    xeinside        [^\\']+
    xeescape        [\\][^0-7]
    xeoctesc        [\\][0-7]{1,3}
    xehexesc        [\\]x[0-9A-Fa-f]{1,2}
    xeunicode       [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
    xeunicodefail   [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
    
    /* Extended quote
     * xqdouble implements embedded quote, ''''
     */
    xqstart         {quote}
    xqdouble        {quote}{quote}
    xqinside        [^']+
    
    // Dollar quoting: $xxx$ ... $xxx$ (the tag between the dollars may be empty)
    dolq_start      [A-Za-z\200-\377_]
    dolq_cont       [A-Za-z\200-\377_0-9]
    dolqdelim       \$({dolq_start}{dolq_cont}*)?\$
    dolqfailed      \${dolq_start}{dolq_cont}*
    dolqinside      [^$]+
    
    // Double-quoted identifiers
    dquote          \"
    xdstart         {dquote}
    xdstop          {dquote}
    xddouble        {dquote}{dquote}
    xdinside        [^"]+
    
    /* Unicode escapes: UESCAPE followed by a single quoted character */
    uescape         [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
    /* error rule to avoid backup */
    uescapefail     ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
    
    /* Quoted identifier with Unicode escapes */
    xuistart        [uU]&{dquote}
    xuistop1        {dquote}{whitespace}*{uescapefail}?
    xuistop2        {dquote}{whitespace}*{uescape}
    
    /* Quoted string with Unicode escapes */
    xusstart        [uU]&{quote}
    xusstop1        {quote}{whitespace}*{uescapefail}?
    xusstop2        {quote}{whitespace}*{uescape}
    
    /* error rule to avoid backup */
    xufailed        [uU]&
    
    // C-style comments; xcstart also swallows any operator characters that follow the slash-star
    xcstart         \/\*{op_chars}*
    xcstop          \*+\/
    xcinside        [^*/]+
    
    // Identifiers: a letter, high-bit byte or underscore, followed by those plus digits, $ and #
    digit           [0-9]
    ident_start     [A-Za-z\200-\377_]
    ident_cont      [A-Za-z\200-\377_0-9\$\#]
    
    identifier      {ident_start}{ident_cont}*
    
    // Fixed multi-character tokens
    typecast        "::"
    plus_join       "(+)"
    dot_dot         \.\.
    colon_equals    ":="
    para_equals "=>"
    
    /*
     * "self" is the set of chars that should be returned as single-character
     * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
     * which can be one or more characters long (but if a single-char token
     * appears in the "self" set, it is not to be returned as an Op).  Note
     * that the sets overlap, but each has some chars that are not in the other.
     *
     * If you change either set, adjust the character lists appearing in the
     * rule for "operator"!
     */
    self            [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
    op_chars        [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
    operator        {op_chars}+
    
    /* we no longer allow unary minus in numbers.
     * instead we pass it separately to parser. there it gets
     * coerced via doNegate() -- Leon aug 20 1999
     *
     * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
     *
     * {realfail1} and {realfail2} are added to prevent the need for scanner
     * backup when the {real} rule fails to match completely.
     */
    
    integer         {digit}+
    decimal         (({digit}*\.{digit}+)|({digit}+\.{digit}*))
    decimalfail     {digit}+\.\.
    real            ({integer}|{decimal})[Ee][-+]?{digit}+
    realfail1       ({integer}|{decimal})[Ee]
    realfail2       ({integer}|{decimal})[Ee][-+]
    
    /* positional parameter: a dollar sign followed by an integer */
    param           \${integer}
    
    /* colon-prefixed parameter: a colon followed by an identifier or an integer */
    newParam        :({identifier}|{integer})
    
    /* a colon followed by an integer, optional whitespace and a closing bracket */
    newArray        :({integer}{space}*\])
    
    /* any single character not matched by another rule */
    other           .
    

    规则段

    /* 规则 { 执行代码 } */
    %%
    
    {whitespace}    { /* ignore whitespace, newlines and "--" comments */ }
    // Match C-style block comments
    {xcstart}   {
                        SET_YYLLOC(); // record the current location
                        yyextra->xcdepth = 0;
                        BEGIN(xc);
                        /* Put back any characters past slash-star; see above */
                        yyless(2); // keep only the first 2 characters of the token, push the rest back to the input
                        /* when collecting a hint string, start a literal buffer with the comment opener */
                        if (yyextra->is_hint_str)
                        {
                            startlit();
                            addlit(yytext, yyleng, yyscanner);
                        }
            }
    
    <xc>{xcstart}   {
                        /* nested comment opener: go one level deeper */
                        (yyextra->xcdepth)++;
                        /* Put back any characters past slash-star; see above */
                        yyless(2);
                        if (yyextra->is_hint_str)
                        {
                            addlit(yytext, yyleng, yyscanner);
                        }
            }
    
    <xc>{xcstop}    {
                        /* depth 0 means the outermost comment ends here; otherwise pop one nesting level */
                        if (yyextra->xcdepth <= 0)
                            BEGIN(INITIAL);
                        else
                            (yyextra->xcdepth)--;
    
                        /* a hint comment is returned to the parser as a COMMENTSTRING token */
                        if (yyextra->is_hint_str)
                        {   
                            addlit(yytext, yyleng, yyscanner);
                            yylval->str = litbufdup(yyscanner);
                            yyextra->is_hint_str = false;
                            return COMMENTSTRING;
                        }
            }
    
    <xc>{xcinside}  {
                        /* comment text is kept only while collecting a hint string */
                        if (yyextra->is_hint_str)
                        {
                            addlit(yytext, yyleng, yyscanner);
                        }
            }
    
    <xc>{op_chars}  {
                        if (yyextra->is_hint_str)
                        {
                            addlit(yytext, yyleng, yyscanner);
                        }
            }
    
    <xc>\*+     {
                        if (yyextra->is_hint_str)
                        {
                            addlit(yytext, yyleng, yyscanner);
                        }
            }
    
    <xc><<EOF>>     { yyerror("unterminated /* comment"); }
    // Match bit strings: b'...'
    {xbstart}       {
                        /* Binary bit type.
                         * At some point we should simply pass the string
                         * forward to the parser and label it there.
                         * In the meantime, place a leading "b" on the string
                         * to mark it for the input routine as a binary string.
                         */
                        SET_YYLLOC();
                        BEGIN(xb);
                        startlit();
                        addlitchar('b', yyscanner);
                    }
    <xb>{quotestop} |
    <xb>{quotefail} {
                        /* keep only the closing quote; anything matched after it is rescanned */
                        yyless(1);
                        BEGIN(INITIAL);
                        yylval->str = litbufdup(yyscanner);
                        yyextra->is_hint_str = false;
                        return BCONST;
            }
    <xh>{xhinside}  |
    <xb>{xbinside}  {
                        /* shared by hex and bit strings: append the literal body */
                        addlit(yytext, yyleng, yyscanner);
                    }
    <xh>{quotecontinue} |
    <xb>{quotecontinue} {
                        /* ignore */
                    }
    <xb><<EOF>>     { yyerror("unterminated bit string literal"); }
    // Match hexadecimal strings: x'...'
    {xhstart}       {
                        /* Hexadecimal bit type.
                         * At some point we should simply pass the string
                         * forward to the parser and label it there.
                         * In the meantime, place a leading "x" on the string
                         * to mark it for the input routine as a hex string.
                         */
                        SET_YYLLOC();
                        BEGIN(xh);
                        startlit();
                        addlitchar('x', yyscanner);
                    }
    <xh>{quotestop} |
    <xh>{quotefail} {
                        /* keep only the closing quote; anything matched after it is rescanned */
                        yyless(1);
                        BEGIN(INITIAL);
                        yylval->str = litbufdup(yyscanner);
                        yyextra->is_hint_str = false;
                        return XCONST;
            }
    <xh><<EOF>>     { yyerror("unterminated hexadecimal string literal"); }
    // Match national character strings: n'...'
    {xnstart}       {
                        /* National character.
                         * We will pass this along as a normal character string,
                         * but preceded with an internally-generated "NCHAR".
                         */
                        const ScanKeyword *keyword;
    
                        SET_YYLLOC();
                        yyless(1);              /* eat only 'n' this time */
    
                        /* look "nchar" up in the keyword table to obtain its token value */
                        keyword = ScanKeywordLookup("nchar",
                                                    yyextra->keywords,
                                                    yyextra->num_keywords);
                        if (keyword != NULL)
                        {
                            yylval->keyword = keyword->name;
                            yyextra->is_hint_str = false;
                            return keyword->value;
                        }
                        else
                        {
                            /* If NCHAR isn't a keyword, just return "n" */
                            yylval->str = pstrdup("n");
                            yyextra->ident_quoted = false;
                            yyextra->is_hint_str = false;
                            return IDENT;
                        }
                    }
    // Match the opening of string literals: '...', e'...' and u&'...'
    {xqstart}       {
                        yyextra->warn_on_first_escape = true;
                        yyextra->saw_non_ascii = false;
                        SET_YYLLOC();
                        /* a plain quote is scanned as a standard string only when standard_conforming_strings is on */
                        if (u_sess->attr.attr_sql.standard_conforming_strings)
                            BEGIN(xq);
                        else
                            BEGIN(xe);
                        startlit();
                    }
    {xestart}       {
                        /* explicit e'...' string: always scanned in the backslash-escape state */
                        yyextra->warn_on_first_escape = false;
                        yyextra->saw_non_ascii = false;
                        SET_YYLLOC();
                        BEGIN(xe);
                        startlit();
                    }
    {xusstart}      {
                        SET_YYLLOC();
                        /* u&'...' strings are rejected when standard_conforming_strings is off */
                        if (!u_sess->attr.attr_sql.standard_conforming_strings)
                            ereport(ERROR,
                                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                     errmsg("unsafe use of string constant with Unicode escapes"),
                                     errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
                                     lexer_errposition()));
                        BEGIN(xus);
                        startlit();
                    }
    <xq,xe>{quotestop}  |
    <xq,xe>{quotefail} {
                        /* keep only the closing quote; anything matched after it is rescanned */
                        yyless(1);
                        BEGIN(INITIAL);
                        /*
                         * check that the data remains valid if it might have been
                         * made invalid by unescaping any chars.
                         */
                        if (yyextra->saw_non_ascii)
                            pg_verifymbstr(yyextra->literalbuf,
                                           yyextra->literallen,
                                           false);
                        yylval->str = litbufdup(yyscanner);
                        yyextra->is_hint_str = false;
                        return SCONST;
                    }
    <xus>{xusstop1} {
                        /* throw back all but the quote */
                        yyless(1);
                        BEGIN(INITIAL);
                        /* no UESCAPE clause matched: backslash is used as the escape character */
                        yylval->str = litbuf_udeescape('\\', yyscanner);
                        yyextra->is_hint_str = false;
                        return SCONST;
            }
    <xus>{xusstop2} {
                        BEGIN(INITIAL);
                        /* the escape character is the one just before the final quote of the UESCAPE clause */
                        yylval->str = litbuf_udeescape(yytext[yyleng-2], yyscanner);
                        yyextra->is_hint_str = false;
                        return SCONST;
            }
    <xq,xe,xus>{xqdouble} {
                        /* two consecutive quotes stand for one literal quote */
                        addlitchar('\'', yyscanner);
                    }
    <xq,xus>{xqinside}  {
                        addlit(yytext, yyleng, yyscanner);
                    }
    <xe>{xeinside}  {
                        addlit(yytext, yyleng, yyscanner);
                    }
    <xe>{xeunicode} {
                        /* skip the backslash and the u/U, then parse the hex digits */
                        pg_wchar c = strtoul(yytext+2, NULL, 16);
    
                        check_escape_warning(yyscanner);
    
                        /* a first surrogate is saved and must be completed in the xeu state */
                        if (is_utf16_surrogate_first(c))
                        {
                            yyextra->utf16_first_part = c;
                            BEGIN(xeu);
                        }
                        else if (is_utf16_surrogate_second(c))
                            yyerror("invalid Unicode surrogate pair");
                        else
                            addunicode(c, yyscanner);
                    }
    <xeu>{xeunicode} {
                        /* skip the backslash and the u/U, then parse the hex digits */
                        pg_wchar c = strtoul(yytext+2, NULL, 16);
    
                        if (!is_utf16_surrogate_second(c))
                            yyerror("invalid Unicode surrogate pair");
    
                        /* combine the saved first half with this second half */
                        c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
    
                        addunicode(c, yyscanner);
    
                        BEGIN(xe);
                    }
    <xeu>.          { yyerror("invalid Unicode surrogate pair"); }
    <xeu>\n         { yyerror("invalid Unicode surrogate pair"); }
    <xeu><<EOF>>    { yyerror("invalid Unicode surrogate pair"); }
    <xe,xeu>{xeunicodefail} {
                            /* a \u or \U escape with too few hex digits */
                            ereport(ERROR,
                                    (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                                     errmsg("invalid Unicode escape"),
                                     errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
                                     lexer_errposition()));
                    }
    <xe>{xeescape}  {
                        /* backslash-quote is rejected depending on the backslash_quote setting and the client encoding */
                        if (yytext[1] == '\'')
                        {
                            if (u_sess->attr.attr_sql.backslash_quote == BACKSLASH_QUOTE_OFF ||
                                (u_sess->attr.attr_sql.backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
                                 PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
                                ereport(ERROR,
                                        (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                                         errmsg("unsafe use of \\' in a string literal"),
                                         errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
                                         lexer_errposition()));
                        }
                        check_string_escape_warning(yytext[1], yyscanner);
                        addlitchar(unescape_single_char(yytext[1], yyscanner),
                                   yyscanner);
                    }
    <xe>{xeoctesc}  {
                        /* skip the backslash and parse up to three octal digits */
                        unsigned char c = strtoul(yytext+1, NULL, 8);
    
                        check_escape_warning(yyscanner);
                        addlitchar(c, yyscanner);
                        /* remember that the literal must be re-verified when the string ends */
                        if (c == '\0' || IS_HIGHBIT_SET(c))
                            yyextra->saw_non_ascii = true;
                    }
    <xe>{xehexesc}  {
                        /* skip the backslash and the x, then parse one or two hex digits */
                        unsigned char c = strtoul(yytext+2, NULL, 16);
    
                        check_escape_warning(yyscanner);
                        addlitchar(c, yyscanner);
                        /* remember that the literal must be re-verified when the string ends */
                        if (c == '\0' || IS_HIGHBIT_SET(c))
                            yyextra->saw_non_ascii = true;
                    }
    <xq,xe,xus>{quotecontinue} {
                        /* ignore */
                    }
    <xe>.           {
                        /* This is only needed for \ just before EOF */
                        addlitchar(yytext[0], yyscanner);
                    }
    <xq,xe,xus><<EOF>>      { yyerror("unterminated quoted string"); }
    // Match dollar-quoted strings: $xxx$ ... $xxx$
    {dolqdelim}     {
                        SET_YYLLOC();
                        /* remember the opening delimiter so the closing one can be compared against it */
                        yyextra->dolqstart = pstrdup(yytext);
                        BEGIN(xdolq);
                        startlit();
                    }
    {dolqfailed}    {
                        SET_YYLLOC();
                        /* throw back all but the initial "$" */
                        yyless(1);
                        /* and treat it as {other} */
                        yyextra->is_hint_str = false;
                        return yytext[0];
                    }
    <xdolq>{dolqdelim} {
                        /* the string ends only when the delimiter equals the opening one */
                        if (strcmp(yytext, yyextra->dolqstart) == 0)
                        {
                            FREE_POINTER(yyextra->dolqstart);
                            yyextra->dolqstart = NULL;
                            BEGIN(INITIAL);
                            yylval->str = litbufdup(yyscanner);
                            yyextra->is_hint_str = false;
                            return SCONST;
                        }
                        else
                        {
                            /*
                             * When we fail to match $...$ to dolqstart, transfer
                             * the $... part to the output, but put back the final
                             * $ for rescanning.  Consider $delim$...$junk$delim$
                             */
                            addlit(yytext, yyleng-1, yyscanner);
                            yyless(yyleng-1);
                        }
                    }
    <xdolq>{dolqinside} {
                        addlit(yytext, yyleng, yyscanner);
                    }
    <xdolq>{dolqfailed} {
                        addlit(yytext, yyleng, yyscanner);
                    }
    <xdolq>.        {
                        /* This is only needed for $ inside the quoted text */
                        addlitchar(yytext[0], yyscanner);
                    }
    <xdolq><<EOF>>  { yyerror("unterminated dollar-quoted string"); }
    
    {xdstart}       {
                        /* opening double quote of a delimited identifier */
                        SET_YYLLOC();
                        BEGIN(xd);
                        startlit();
                    }
    {xuistart}      {
                        /* opening of a delimited identifier with Unicode escapes */
                        SET_YYLLOC();
                        BEGIN(xui);
                        startlit();
                    }
    <xd>{xdstop}    {
                        char           *ident;
    
                        BEGIN(INITIAL);
                        if (yyextra->literallen == 0)
                            yyerror("zero-length delimited identifier");
                        ident = litbufdup(yyscanner);
                        /* identifiers of NAMEDATALEN bytes or more are truncated */
                        if (yyextra->literallen >= NAMEDATALEN)
                            truncate_identifier(ident, yyextra->literallen, yyextra->warnOnTruncateIdent);
                        yylval->str = ident;
                        yyextra->ident_quoted = true;
                        yyextra->is_hint_str = false;
                        return IDENT;
                    }
    <xui>{xuistop1} {
                        char           *ident;
                        int             identlen;
    
                        BEGIN(INITIAL);
                        if (yyextra->literallen == 0)
                            yyerror("zero-length delimited identifier");
                        /* no UESCAPE clause matched: backslash is used as the escape character */
                        ident = litbuf_udeescape('\\', yyscanner);
                        identlen = strlen(ident);
                        if (identlen >= NAMEDATALEN)
                            truncate_identifier(ident, identlen, yyextra->warnOnTruncateIdent);
                        yylval->str = ident;
                        /* throw back all but the quote */
                        yyless(1);
                        yyextra->ident_quoted = false;
                        yyextra->is_hint_str = false;
                        return IDENT;
                    }
    <xui>{xuistop2} {
                        char           *ident;
                        int             identlen;
    
                        BEGIN(INITIAL);
                        if (yyextra->literallen == 0)
                            yyerror("zero-length delimited identifier");
                        /* the escape character is the one just before the final quote of the UESCAPE clause */
                        ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
                        identlen = strlen(ident);
                        if (identlen >= NAMEDATALEN)
                            truncate_identifier(ident, identlen, yyextra->warnOnTruncateIdent);
                        yylval->str = ident;
                        yyextra->ident_quoted = false;
                        yyextra->is_hint_str = false;
                        return IDENT;
                    }
    <xd,xui>{xddouble}  {
                        /* two consecutive double quotes stand for one literal double quote */
                        addlitchar('"', yyscanner);
                    }
    <xd,xui>{xdinside}  {
                        addlit(yytext, yyleng, yyscanner);
                    }
    <xd,xui><<EOF>>     { yyerror("unterminated quoted identifier"); }
    
    {xufailed}  {
                        char           *ident;
    
                        SET_YYLLOC();
                        /* throw back all but the initial u/U */
                        yyless(1);
                        /* and treat it as {identifier} */
                        ident = downcase_truncate_identifier(yytext, yyleng, yyextra->warnOnTruncateIdent);
                        yylval->str = ident;
                        yyextra->ident_quoted = false;
                        yyextra->is_hint_str = false;
                        return IDENT;
                    }
    // Match "::"
    {typecast}      {
                        SET_YYLLOC();
                        yyextra->is_hint_str = false;
                        return TYPECAST;
                    }
    // Match "(+)"
    {plus_join} {
                        SET_YYLLOC();
                        yyextra->is_hint_str = false;
                        return ORA_JOINOP;
                    }
    // Match ".."
    {dot_dot}       {
                        SET_YYLLOC();
                        yyextra->is_hint_str = false;
                        return DOT_DOT;
                    }
    // Match ":="
    {colon_equals}  {
                        SET_YYLLOC();
                        yyextra->is_hint_str = false;
                        return COLON_EQUALS;
                    }
    // Match "=>"
    {para_equals}   {
                        SET_YYLLOC();
                        yyextra->is_hint_str = false;
                        return PARA_EQUALS;
                    }
    // Match single-character tokens
    {self}          {
                        SET_YYLLOC();
                        /*
                         * Get the semicolon which is not in proc body nor in the '( )', treat it
                         * as end flag of a single query and store it in locationlist.
                         */
                        if (yyextra->dolqstart == NULL)
                        {
                            /* track parenthesis nesting so that only top-level semicolons are recorded */
                            if (yytext[0] == '(')
                                yyextra->paren_depth++;
                            else if (yytext[0] == ')' && yyextra->paren_depth > 0)
                                yyextra->paren_depth--;
                            else if (yytext[0] == ';' && yyextra->paren_depth == 0 && !yyextra->in_slash_proc_body)
                                yyextra->query_string_locationlist = lappend_int(yyextra->query_string_locationlist, *yylloc);
                        }
                        yyextra->is_hint_str = false;
                        /* the character itself is the token */
                        return yytext[0];
                    }
    // Match operators
    {operator}      {
                        /*
                         * Check for embedded slash-star or dash-dash; those
                         * are comment starts, so operator must stop there.
                         * Note that slash-star or dash-dash at the first
                         * character will match a prior rule, not this one.
                         */
                        int     nchars = yyleng;
                        char   *slashstar = strstr(yytext, "/*");
                        char   *dashdash = strstr(yytext, "--");
    
                        if (slashstar && dashdash)
                        {
                            /* if both appear, take the first one */
                            if (slashstar > dashdash)
                                slashstar = dashdash;
                        }
                        else if (!slashstar)
                            slashstar = dashdash;
                        /* slashstar now points at the earliest comment start, if any */
                        if (slashstar)
                            nchars = slashstar - yytext;
    
                        /*
                         * For SQL compatibility, '+' and '-' cannot be the
                         * last char of a multi-char operator unless the operator
                         * contains chars that are not in SQL operators.
                         * The idea is to lex '=-' as two operators, but not
                         * to forbid operator names like '?-' that could not be
                         * sequences of SQL operators.
                         */
                        while (nchars > 1 &&
                               (yytext[nchars-1] == '+' ||
                                yytext[nchars-1] == '-'))
                        {
                            int     ic;
    
                            for (ic = nchars-2; ic >= 0; ic--)
                            {
                                if (strchr("~!@#^&|`?%", yytext[ic]))
                                    break;
                            }
                            if (ic >= 0)
                                break; /* found a char that makes it OK */
                            nchars--; /* else remove the +/-, and check again */
                        }
    
                        SET_YYLLOC();
    
                        if (nchars < (int)yyleng)
                        {
                            /* Strip the unwanted chars from the token */
                            yyless(nchars);
                            /*
                             * If what we have left is only one char, and it's
                             * one of the characters matching "self", then
                             * return it as a character token the same way
                             * that the "self" rule would have.
                             */
                            if (nchars == 1 &&
                                strchr(",()[].;:+-*/%^<>=", yytext[0]))
                            {
                                yyextra->is_hint_str = false;
                                return yytext[0];
                            }
                        }
    
                        /*
                         * Complain if operator is too long.  Unlike the case
                         * for identifiers, we make this an error not a notice-
                         * and-truncate, because the odds are we are looking at
                         * a syntactic mistake anyway.
                         */
                        if (nchars >= NAMEDATALEN)
                            yyerror("operator too long");
    
                        /* Convert the "!=" and "^=" operators to "<>" for compatibility */
                        if (strcmp(yytext, "!=") == 0 || strcmp(yytext, "^=") == 0)
                        {
                            yylval->str = pstrdup("<>");
                            yyextra->is_hint_str = false;
                            return CmpOp;
                        }
                        /* the remaining comparison operators keep their own text but are also returned as CmpOp */
                        else if (strcmp(yytext, ">=") == 0 || strcmp(yytext, "<=") == 0 || strcmp(yytext, "<>") == 0)
                        {
                            yylval->str = pstrdup(yytext);
                            yyextra->is_hint_str = false;
                            return CmpOp;
                        }
                        else
                            yylval->str = pstrdup(yytext);
                        yyextra->is_hint_str = false;
                        return Op;
                    }
    {newArray}      {
                        /* keep only the leading colon and return it as a character token; the rest is rescanned */
                        yyless(1);
                        yyextra->is_hint_str = false;
                        return yytext[0];
                    }
    // Match $n parameters
    {param}         {
                        SET_YYLLOC();
                        /* yytext + 1 skips the leading "$" */
                        yylval->ival = getDynaParamSeq(yytext + 1, false, false, yyscanner);
                        yyextra->is_hint_str = false;
                        return PARAM;
                    }
    {newParam}      {
                        SET_YYLLOC();
                        /* yytext + 1 skips the leading ":" */
                        yylval->ival = getDynaParamSeq(yytext + 1, false, true, yyscanner);
                        yyextra->is_hint_str = false;
                        return PARAM;
                    }
    // Match integers
    {integer}       {
                        SET_YYLLOC();
                        yyextra->is_hint_str = false;
                        return process_integer_literal(yytext, yylval);
                    }
    // Match decimal and floating-point numbers
    {decimal}       {
                        SET_YYLLOC();
                        /* the literal text is passed to the parser as a string */
                        yylval->str = pstrdup(yytext);
                        yyextra->is_hint_str = false;
                        return FCONST;
                    }
    {decimalfail}   {
                        /* throw back the .., and treat as integer */
                        yyless(yyleng-2);
                        SET_YYLLOC();
                        yyextra->is_hint_str = false;
                        return process_integer_literal(yytext, yylval);
                    }
    {real}          {
                        SET_YYLLOC();
                        yylval->str = pstrdup(yytext);
                        yyextra->is_hint_str = false;
                        return FCONST;
                    }
    {realfail1}     {
                        /*
                         * throw back the [Ee], and treat as {decimal}.  Note
                         * that it is possible the input is actually {integer},
                         * but since this case will almost certainly lead to a
                         * syntax error anyway, we don't bother to distinguish.
                         */
                        yyless(yyleng-1);
                        SET_YYLLOC();
                        yylval->str = pstrdup(yytext);
                        yyextra->is_hint_str = false;
                        return FCONST;
                    }
    {realfail2}     {
                        /* throw back the [Ee][+-], and proceed as above */
                        yyless(yyleng-2);
                        SET_YYLLOC();
                        yylval->str = pstrdup(yytext);
                        yyextra->is_hint_str = false;
                        return FCONST;
                    }
    
    // Match identifiers and keywords
    {identifier}    {
                        const ScanKeyword *keyword;
                        char           *ident;
    
                        SET_YYLLOC();
    
                        /* Binary search to check whether the text is a keyword */
                        keyword = ScanKeywordLookup(yytext,
                                                    yyextra->keywords,
                                                    yyextra->num_keywords);
    
                        /* Cleared here; set again below only for DML keywords. */
                        yyextra->is_hint_str = false;
    
                        if (keyword != NULL)
                        {
                            yylval->keyword = keyword->name;
    
                            /* Find the CREATE PROCEDURE syntax and set dolqstart. */
                            if (keyword->value == CREATE)
                            {
                                yyextra->is_createstmt = true;
                            }
                            else if (keyword->value == TRIGGER && yyextra->is_createstmt)
                            {
                                /* CREATE TRIGGER does not need dolqstart to be set */
                                yyextra->is_createstmt = false;
                            }
                            else if ((keyword->value == PROCEDURE || keyword->value == FUNCTION)
                                     && yyextra->is_createstmt)
                            {
                                /* A non-NULL yyextra->dolqstart means we are in a proc with $$. */
                                yyextra->dolqstart = "";
                            }
                            else if (keyword->value == BEGIN_P)
                            {
                                /* cases that have to be a trans stmt and fall quickly */
                                if (yyg->yy_hold_char == ';' || /* found ';' after 'begin' */
                                    yyg->yy_hold_char == '\0')  /* found '\0' after 'begin' */
                                    return BEGIN_NON_ANOYBLOCK;
                                /* look for other transaction stmt */
                                if (is_trans_stmt(yyextra->scanbuf, yyextra->scanbuflen))
                                    return BEGIN_NON_ANOYBLOCK;
                            }
                            else if (keyword->value == SELECT ||
                                     keyword->value == UPDATE||
                                     keyword->value == INSERT ||
                                     keyword->value == DELETE_P ||
                                     keyword->value == MERGE)
                            {
                                /* Flag that one of these DML keywords was just scanned. */
                                yyextra->is_hint_str = true;
                            }
    
                            return keyword->value;
                        }
    
                        /* Not a keyword: convert to lower case, and truncate if longer than 64 */
                        ident = downcase_truncate_identifier(yytext, yyleng, yyextra->warnOnTruncateIdent);
                        yylval->str = ident;
                        yyextra->ident_quoted = false;
                        return IDENT;
                    }
    
    {other}         {
                        /* Return the first character of the match as its own token. */
                        SET_YYLLOC();
                        yyextra->is_hint_str = false;
                        return yytext[0];
                    }
    
    <<EOF>>         {
                        /* End of input: record the location, then stop scanning. */
                        SET_YYLLOC();
                        yyterminate();
                    }
    
    %%
    

    用户子程序段

    
    /*
     * Arrange access to yyextra for subroutines of the main yylex() function.
     * We expect each subroutine to have a yyscanner parameter.  Rather than
     * use the yyget_xxx functions, which might or might not get inlined by the
     * compiler, we cheat just a bit and cast yyscanner to the right type.
     */
    #undef yyextra
    #define yyextra  (((struct yyguts_t *) yyscanner)->yyextra_r)
    
    /* Likewise for a couple of other things we need. */
    #undef yylloc
    #define yylloc  (((struct yyguts_t *) yyscanner)->yylloc_r)
    #undef yyleng
    #define yyleng  (((struct yyguts_t *) yyscanner)->yyleng_r)
    
    
    /*
     * scanner_errposition
     *      Report the position of a lexer or parser error.
     *
     * location is a byte offset into the scan buffer; a negative value means
     * the location is unknown, in which case nothing is reported and 0 is
     * returned.  Otherwise the result of errposition() is returned.
     */
    int
    scanner_errposition(int location, core_yyscan_t yyscanner)
    {
        if (location >= 0)
        {
            /* Translate the byte offset into a 1-based character number. */
            int     charpos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;

            /* Hand the character position to the ereport machinery. */
            return errposition(charpos);
        }

        /* Unknown location: nothing to do. */
        return 0;
    }
    
    /*
     * scanner_yyerror
     *      Report a lexer or parser error.
     *
     * The message differs depending on whether the current location is the
     * end of the input or some token text inside the scan buffer.
     */
    void
    scanner_yyerror(const char *message, core_yyscan_t yyscanner)
    {
        const char *errtext = yyextra->scanbuf + *yylloc;

        if (*errtext != YY_END_OF_BUFFER_CHAR)
        {
            /* There is still text at the error location: quote it. */
            ereport(ERROR,
                    (errcode(ERRCODE_SYNTAX_ERROR),
                     /* translator: first %s is typically the translation of "syntax error" */
                     errmsg("%s at or near \"%s\"", _(message), errtext),
                     lexer_errposition()));
        }
        else
        {
            /* The error location is the end-of-buffer marker. */
            ereport(ERROR,
                    (errcode(ERRCODE_SYNTAX_ERROR),
                     /* translator: %s is typically the translation of "syntax error" */
                     errmsg("%s at end of input", _(message)),
                     lexer_errposition()));
        }
    }
    
    /*
     * scanner_init
     *      Set up a flex scanner for the query text in str.
     *
     * yyext receives the per-scan state (keyword table, scan buffer, literal
     * buffer, flags).  Returns the scanner handle.
     */
    core_yyscan_t
    scanner_init(const char *str,
                 core_yy_extra_type *yyext,
                 const ScanKeyword *keywords,
                 int num_keywords)
    {
        Size        len = strlen(str);
        yyscan_t    lexer;

        /* Create the flex scanner itself. */
        if (yylex_init(&lexer) != 0)
        {
            ereport(ERROR,
                    (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                        errmsg("yylex_init() failed: %m")));
        }

        /* Attach yyext to the scanner as its yyextra. */
        core_yyset_extra(yyext, lexer);

        /* Keyword table and its size. */
        yyext->keywords = keywords;
        yyext->num_keywords = num_keywords;

        /* Scanner state flags and lists start out cleared. */
        yyext->in_slash_proc_body = false;
        yyext->paren_depth = 0;
        yyext->query_string_locationlist = NIL;
        yyext->is_createstmt = false;
        yyext->dolqstart = NULL;
        yyext->is_hint_str = false;
        yyext->parameter_list = NIL;

        /*
         * Copy the input into a scan buffer that ends with the two
         * end-of-buffer characters flex requires.
         */
        yyext->scanbuf = (char *) palloc(len + 2);
        yyext->scanbuflen = len;
        memcpy(yyext->scanbuf, str, len);
        yyext->scanbuf[len + 1] = YY_END_OF_BUFFER_CHAR;
        yyext->scanbuf[len] = yyext->scanbuf[len + 1];
        yy_scan_buffer(yyext->scanbuf, len + 2, lexer);

        /* The literal buffer starts at a modest size and grows on demand. */
        yyext->literalalloc = 1024;
        yyext->literalbuf = (char *) palloc(yyext->literalalloc);
        yyext->literallen = 0;
        yyext->warnOnTruncateIdent = true;

        /* With initflag = true this only resets the dynamic-parameter flags. */
        getDynaParamSeq("init", true, true, NULL);

        return lexer;
    }
    
    
    /*
     * scanner_finish
     *      Release scanner memory once parsing is complete.
     *
     * When t_thrd.postgres_cxt.clear_key_memory is set, the scan buffer and the
     * literal buffer are overwritten with 0x7F first.
     */
    void
    scanner_finish(core_yyscan_t yyscanner)
    {
        if (t_thrd.postgres_cxt.clear_key_memory)
        {
            errno_t rc = EOK;
            memset(yyextra->scanbuf, 0x7F, yyextra->scanbuflen);
            /* NOTE(review): volatile self-assignment, presumably to keep the memset from being optimized away. */
            *(volatile char*)(yyextra->scanbuf) = *(volatile char*)(yyextra->scanbuf);
            /*
             * Wipe the whole allocation, not just literallen bytes: literallen
             * is only the current fill level of this reused buffer, so bytes
             * beyond it would otherwise be left uncleared.
             */
            rc = memset_s(yyextra->literalbuf, yyextra->literalalloc, 0x7F, yyextra->literalalloc);
            securec_check(rc, "\0", "\0");
        }
    
        /*
         * We don't bother to call yylex_destroy(), because all it would do
         * is pfree a small amount of control storage.  It's cheaper to leak
         * the storage until the parsing context is destroyed.  The amount of
         * space involved is usually negligible compared to the output parse
         * tree anyway.
         *
         * We do bother to pfree the scanbuf and literal buffer, but only if they
         * represent a nontrivial amount of space.  The 8K cutoff is arbitrary.
         */
        if (yyextra->scanbuflen >= 8192)
            FREE_POINTER(yyextra->scanbuf);
        if (yyextra->literalalloc >= 8192)
            FREE_POINTER(yyextra->literalbuf);
        /* The parameter name list and its elements are always released. */
        if (yyextra->parameter_list)
        {
            list_free_deep(yyextra->parameter_list);
            yyextra->parameter_list = NIL;
        }
    }
    
    
    /*
     * addlit
     *      Append yleng bytes starting at ytext to the literal buffer,
     *      growing the buffer first when it is too small.
     */
    static void
    addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
    {
        /* Grow by doubling until the new data fits with room to spare. */
        if (yyextra->literalalloc <= (yyextra->literallen + yleng))
        {
            while (yyextra->literalalloc <= (yyextra->literallen + yleng))
                yyextra->literalalloc *= 2;

            /* Doubling a buffer over 512M exceeds 1G, hence repalloc_huge. */
            yyextra->literalbuf = (char *) repalloc_huge(yyextra->literalbuf,
                                                         yyextra->literalalloc);
        }

        /* Copy the new bytes in after the existing content. */
        memcpy(&yyextra->literalbuf[yyextra->literallen], ytext, yleng);
        yyextra->literallen = yyextra->literallen + yleng;
    }
    
    
    /*
     * addlitchar
     *      Append a single byte to the literal buffer, doubling the buffer
     *      first when it is full.
     */
    static void
    addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
    {
        /* enlarge buffer if needed */
        if ((yyextra->literallen + 1) >= yyextra->literalalloc)
        {
            yyextra->literalalloc *= 2;
            /*
             * Use repalloc_huge, as addlit() does for this same buffer: once
             * the buffer is larger than 512M, doubling it exceeds 1G.
             */
            yyextra->literalbuf = (char *) repalloc_huge(yyextra->literalbuf,
                                                         yyextra->literalalloc);
        }
        /* append new data */
        yyextra->literalbuf[yyextra->literallen] = ychar;
        yyextra->literallen += 1;
    }
    
    
    /*
     * litbufdup
     *      Return a freshly palloc'd, null-terminated copy of the current
     *      contents of the literal buffer.
     */
    static char *
    litbufdup(core_yyscan_t yyscanner)
    {
        int         nbytes = yyextra->literallen;
        char       *copy = (char *) palloc(nbytes + 1);

        memcpy(copy, yyextra->literalbuf, nbytes);
        copy[nbytes] = '\0';    /* the buffer itself is not terminated */

        return copy;
    }
    
    /*
     * process_integer_literal
     *      Convert the decimal text in token and store the result in *lval.
     *
     * Returns ICONST with lval->ival set when the value converts cleanly, or
     * FCONST with lval->str set to a copy of the text when it does not.
     */
    static int
    process_integer_literal(const char *token, YYSTYPE *lval)
    {
        char       *stop;
        long        parsed;
        bool        as_float;

        errno = 0;
        parsed = strtol(token, &stop, 10);

        /* Trailing junk or a range error means it cannot be an ICONST. */
        as_float = (*stop != '\0' || errno == ERANGE);
    #ifdef HAVE_LONG_INT_64
        /* if long > 32 bits, check for overflow of int4 */
        if (!as_float && parsed != (long) ((int32) parsed))
            as_float = true;
    #endif

        if (!as_float)
        {
            lval->ival = parsed;
            return ICONST;
        }

        /* integer too large, treat it as a float */
        lval->str = pstrdup(token);
        return FCONST;
    }
    
    /*
     * hexval
     *      Return the numeric value (0..15) of the hexadecimal digit c.
     *      Any other character raises an error.
     */
    static unsigned int
    hexval(unsigned char c)
    {
        unsigned int digit = 0;

        if ('0' <= c && c <= '9')
        {
            digit = c - '0';
            return digit;
        }
        else if ('a' <= c && c <= 'f')
        {
            digit = c - 'a' + 0xA;
            return digit;
        }
        else if ('A' <= c && c <= 'F')
        {
            digit = c - 'A' + 0xA;
            return digit;
        }

        ereport(ERROR,
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
            errmsg("invalid hexadecimal digit")));
        return digit; /* not reached */
    }
    
    /*
     * check_unicode_value
     *      Raise an error for a code point above 0x7F unless the database
     *      encoding is UTF8.  loc points at the escape inside the literal
     *      buffer and is used to position the error.
     */
    static void
    check_unicode_value(pg_wchar c, const char *loc, core_yyscan_t yyscanner)
    {
        /* Under UTF8 every value is acceptable here; otherwise only 7-bit ones. */
        if (GetDatabaseEncoding() != PG_UTF8 && c > 0x7F)
        {
            ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3);   /* 3 for U&" */
            yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
        }
    }
    
    /* True when c lies in the range 0xD800..0xDBFF (first half of a surrogate pair). */
    static bool
    is_utf16_surrogate_first(pg_wchar c)
    {
        if (c < 0xD800)
            return false;

        return c <= 0xDBFF;
    }
    
    /* True when c lies in the range 0xDC00..0xDFFF (second half of a surrogate pair). */
    static bool
    is_utf16_surrogate_second(pg_wchar c)
    {
        if (c < 0xDC00)
            return false;

        return c <= 0xDFFF;
    }
    
    /*
     * Combine the two halves of a surrogate pair into one code point: the low
     * ten bits of first become the high part, the low ten bits of second the
     * low part, and 0x10000 is added.
     */
    static pg_wchar
    surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
    {
        pg_wchar    high_part = (first & 0x3FF) << 10;
        pg_wchar    low_part = second & 0x3FF;

        return high_part + 0x10000 + low_part;
    }
    
    /*
     * addunicode
     *      Append code point c, encoded as UTF-8, to the literal buffer.
     *
     * Zero and values above 0x10FFFF are rejected.  Values above 0x7F are
     * rejected unless the database encoding is UTF8, and otherwise mark the
     * literal as containing non-ASCII data.
     */
    static void
    addunicode(pg_wchar c, core_yyscan_t yyscanner)
    {
        char        utf8seq[8];
        const bool  non_ascii = (c > 0x7F);

        if (c == 0 || c > 0x10FFFF)
        {
            yyerror("invalid Unicode escape value");
        }

        if (non_ascii && GetDatabaseEncoding() != PG_UTF8)
        {
            yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
        }
        if (non_ascii)
        {
            yyextra->saw_non_ascii = true;
        }

        /* Encode, then append as many bytes as the encoded character occupies. */
        unicode_to_utf8(c, (unsigned char *) utf8seq);
        addlit(utf8seq, pg_mblen(utf8seq), yyscanner);
    }
    
    /*
     * litbuf_udeescape
     *      Expand the Unicode escapes in the current literal buffer.
     *
     * escape is the escape character in effect.  It is rejected if it is a hex
     * digit, '+', a single or double quote, or whitespace (per
     * scanner_isspace()).
     *
     * Forms recognized inside the buffer, where E is the escape character:
     *      E E         one literal E
     *      E XXXX      code point given by four hex digits
     *      E +XXXXXX   code point given by six hex digits
     * A first-half UTF-16 surrogate must be followed immediately by an escape
     * giving the second half; the two are combined into a single code point.
     *
     * Returns a newly palloc'd, null-terminated string.  Errors are reported
     * through yyerror().
     */
    static char *
    litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
    {
        char *newm;
        char *litbuf, *in, *out;
        pg_wchar pair_first = 0;    /* pending first surrogate, or 0 if none */
    
        if (isxdigit(escape)
            || escape == '+'
            || escape == '\''
            || escape == '"'
            || scanner_isspace(escape))
        {
            ADVANCE_YYLLOC(yyextra->literallen + yyleng + 1);
            yyerror("invalid Unicode escape character");
        }
    
        /* Make literalbuf null-terminated to simplify the scanning loop */
        litbuf = yyextra->literalbuf;
        litbuf[yyextra->literallen] = '\0';
    
        /*
         * This relies on the subtle assumption that a UTF-8 expansion
         * cannot be longer than its escaped representation.
         */
        newm = (char *)palloc(yyextra->literallen + 1);
    
        in = litbuf;
        out = newm;
        while (*in)
        {
            if (in[0] == escape)
            {
                if (in[1] == escape)
                {
                    /* Doubled escape character: emit it once. */
                    if (pair_first)
                    {
                        ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
                        yyerror("invalid Unicode surrogate pair");
                    }
                    *out++ = escape;
                    in += 2;
                }
                else if (isxdigit((unsigned char) in[1]) &&
                         isxdigit((unsigned char) in[2]) &&
                         isxdigit((unsigned char) in[3]) &&
                         isxdigit((unsigned char) in[4]))
                {
                    /* Four-hex-digit form. */
                    pg_wchar unicode;
    
                    unicode = (hexval(in[1]) << 12) +
                        (hexval(in[2]) << 8) +
                        (hexval(in[3]) << 4) +
                        hexval(in[4]);
                    check_unicode_value(unicode, in, yyscanner);
                    if (pair_first)
                    {
                        /* A first surrogate is pending: this must be the second half. */
                        if (is_utf16_surrogate_second(unicode))
                        {
                            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
                            pair_first = 0;
                        }
                        else
                        {
                            ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
                            yyerror("invalid Unicode surrogate pair");
                        }
                    }
                    else if (is_utf16_surrogate_second(unicode))
                        yyerror("invalid Unicode surrogate pair");
    
                    /* A first surrogate is held back until its partner arrives. */
                    if (is_utf16_surrogate_first(unicode))
                        pair_first = unicode;
                    else
                    {
                        unicode_to_utf8(unicode, (unsigned char *) out);
                        out += pg_mblen(out);
                    }
                    in += 5;
                }
                else if (in[1] == '+' &&
                         isxdigit((unsigned char) in[2]) &&
                         isxdigit((unsigned char) in[3]) &&
                         isxdigit((unsigned char) in[4]) &&
                         isxdigit((unsigned char) in[5]) &&
                         isxdigit((unsigned char) in[6]) &&
                         isxdigit((unsigned char) in[7]))
                {
                    /* '+' followed by six hex digits; handled like the case above. */
                    pg_wchar unicode;
    
                    unicode = (hexval(in[2]) << 20) +
                        (hexval(in[3]) << 16) +
                        (hexval(in[4]) << 12) +
                        (hexval(in[5]) << 8) +
                        (hexval(in[6]) << 4) +
                        hexval(in[7]);
                    check_unicode_value(unicode, in, yyscanner);
                    if (pair_first)
                    {
                        if (is_utf16_surrogate_second(unicode))
                        {
                            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
                            pair_first = 0;
                        }
                        else
                        {
                            ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
                            yyerror("invalid Unicode surrogate pair");
                        }
                    }
                    else if (is_utf16_surrogate_second(unicode))
                        yyerror("invalid Unicode surrogate pair");
    
                    if (is_utf16_surrogate_first(unicode))
                        pair_first = unicode;
                    else
                    {
                        unicode_to_utf8(unicode, (unsigned char *) out);
                        out += pg_mblen(out);
                    }
                    in += 8;
                }
                else
                {
                    /* Escape character not followed by any valid form. */
                    ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
                    yyerror("invalid Unicode escape value");
                }
            }
            else
            {
                /* Ordinary byte: not allowed while a first surrogate is pending. */
                if (pair_first)
                {
                    ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
                    yyerror("invalid Unicode surrogate pair");
                }
                *out++ = *in++;
            }
        }
    
        /* unfinished surrogate pair? */
        if (pair_first)
        {
            ADVANCE_YYLLOC(in - litbuf + 3);            /* 3 for U&" */
            yyerror("invalid Unicode surrogate pair");
        }
    
        *out = '\0';
        /*
         * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
         * codes; but it's probably not worth the trouble, since this isn't
         * likely to be a performance-critical path.
         */
        pg_verifymbstr(newm, out - newm, false);
        return newm;
    }
    
    /*
     * unescape_single_char
     *      Map the character following a backslash to the byte it stands for.
     *
     * b, f, n, r and t yield the corresponding control characters; any other
     * character is returned unchanged.  A NUL or high-bit-set character
     * additionally sets yyextra->saw_non_ascii.
     */
    static unsigned char
    unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
    {
        if (c == 'b')
            return '\b';
        if (c == 'f')
            return '\f';
        if (c == 'n')
            return '\n';
        if (c == 'r')
            return '\r';
        if (c == 't')
            return '\t';

        /* check for backslash followed by non-7-bit-ASCII */
        if (c == '\0' || IS_HIGHBIT_SET(c))
        {
            yyextra->saw_non_ascii = true;
        }

        return c;
    }
    
    /*
     * check_string_escape_warning
     *      Emit the appropriate "nonstandard use" warning for the escaped
     *      character ychar.
     *
     * \' and \\ have dedicated messages; anything else is delegated to
     * check_escape_warning().  A warning is only issued when
     * warn_on_first_escape is set and escape_string_warning is enabled, and
     * warn_on_first_escape is cleared afterwards.
     */
    static void
    check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
    {
        switch (ychar)
        {
            case '\'':
                if (yyextra->warn_on_first_escape && u_sess->attr.attr_sql.escape_string_warning)
                {
                    ereport(WARNING,
                            (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                             errmsg("nonstandard use of \\' in a string literal"),
                             errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
                             lexer_errposition()));
                }
                /* warn only once per string */
                yyextra->warn_on_first_escape = false;
                break;

            case '\\':
                if (yyextra->warn_on_first_escape && u_sess->attr.attr_sql.escape_string_warning)
                {
                    ereport(WARNING,
                            (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                             errmsg("nonstandard use of \\\\ in a string literal"),
                             errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
                             lexer_errposition()));
                }
                /* warn only once per string */
                yyextra->warn_on_first_escape = false;
                break;

            default:
                check_escape_warning(yyscanner);
                break;
        }
    }
    
    /*
     * check_escape_warning
     *      Emit the generic "nonstandard use of escape" warning when
     *      warn_on_first_escape is set and escape_string_warning is enabled,
     *      then clear warn_on_first_escape.
     */
    static void
    check_escape_warning(core_yyscan_t yyscanner)
    {
        const bool  should_warn = yyextra->warn_on_first_escape &&
                                  u_sess->attr.attr_sql.escape_string_warning;

        if (should_warn)
        {
            ereport(WARNING,
                    (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                     errmsg("nonstandard use of escape in a string literal"),
                     errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
                     lexer_errposition()));
        }

        /* warn only once per string */
        yyextra->warn_on_first_escape = false;
    }
    
    /*
     * Interface functions to make flex use palloc() instead of malloc().
     * It'd be better to make these static, but flex insists otherwise.
     */
    
    /* Allocation hook for flex: obtain bytes bytes from palloc(). */
    void *
    core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
    {
        void       *chunk = palloc(bytes);

        return chunk;
    }
    
    /*
     * Reallocation hook for flex: resize ptr with repalloc(), or allocate a
     * fresh chunk with palloc() when ptr is NULL.
     */
    void *
    core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
    {
        return (ptr != NULL) ? repalloc(ptr, bytes) : palloc(bytes);
    }
    
    /* Free hook for flex: release ptr unless it is NULL. */
    void
    core_yyfree(void *ptr, core_yyscan_t yyscanner)
    {
        if (ptr != NULL)
        {
            FREE_POINTER(ptr);
        }
    }
    
    
    /*
     * @Description: get the sequence number of a dynamic SQL parameter
     * @in string: parameter name (for a dollar parameter, its digits)
     * @in initflag: true to only reset the per-session dollar/placeholder flags
     * @in placeholder: true if the parameter is a placeholder, false if it is a
     *                  dollar parameter
     * @in yyscanner: gives access to yyextra (not used when initflag is true)
     * @return - the sequence number of the parameter; 0 for an init call
     */
    long 
    getDynaParamSeq(const char *string, bool initflag, bool placeholder, core_yyscan_t yyscanner)
    {
        const ListCell *lc;
        int             seq = 0;

        /* Init call: clear both flags and do nothing else. */
        if (initflag)
        { 
            u_sess->parser_cxt.has_dollar = false;
            u_sess->parser_cxt.has_placeholder = false;
            return 0;
        }

        /* Dollar parameter: its number is simply the digits in string. */
        if (!placeholder)
        {
            if (u_sess->parser_cxt.has_placeholder)
            {
                ereport(ERROR, 
                        (errcode(ERRCODE_SYNTAX_ERROR), 
                        errmsg("It is forbidden to use placeholder and dollar quoting together.")));
            }
            u_sess->parser_cxt.has_dollar = true;
            return atol(string);
        }

        /* Placeholder: the two parameter styles must not be mixed. */
        u_sess->parser_cxt.has_placeholder = true;
        if (u_sess->parser_cxt.has_dollar)
        {
            ereport(ERROR, 
                    (errcode(ERRCODE_SYNTAX_ERROR), 
                    errmsg("It is forbidden to use placeholder and dollar quoting together.")));
        }

        /* A name seen before keeps its 1-based position in the list. */
        foreach(lc, yyextra->parameter_list)
        {
            seq++;
            if (strcmp((char *) (lfirst(lc)), string) == 0)
                return seq;
        }

        /* New name: remember a copy and number it after the existing ones. */
        yyextra->parameter_list = lappend(yyextra->parameter_list, (void *) pstrdup(string));

        return seq + 1;
    }
    
    /*
     * @Description: once BEGIN has been found, check whether it starts a transaction stmt
     * @param[IN] haystack: the given source string
     * @param[IN] haystack_len: the length of haystack. Note that haystack may have been separated into
     *                          pieces by '\0', so haystack_len is needed.
     * @return: true if it is a transaction stmt, false if not.
     *
     * We have to deal with a tricky case in which we receive a sql like "begin   " which is not terminated
     * with ';' and is followed by several blank chars. The variable 'found_non_blank_char' handles this:
     * if no non-blank char is found in the second piece, it is considered a transaction stmt.
     */
    static bool
    is_trans_stmt(const char *haystack, int haystack_len)
    {
        char *tempstr = (char *)palloc0(haystack_len + 1);
        char *temp = tempstr;
        int line = 1; /* index of the current '\0'-separated piece of haystack */
        bool found_non_blank_char = false; /* set if a non blank char follows begin */
        errno_t rc = EOK;
    
        /*
         * Work on a private copy that is one byte longer than the input, so
         * the strlen() below always finds a terminator inside the buffer.
         */
        rc = memcpy_s(tempstr, haystack_len + 1, haystack, haystack_len);
        securec_check_ss(rc, "\0", "\0");
    
        /* find if the 2nd piece is prefixed by a valid transaction token */
        while (temp < tempstr + haystack_len)
        {
            /* there may be '\0' in the string, and should be skipped */
            if (*temp == '\0')
            {
                temp++;
                line++;
                /* we only search the 2nd piece */
                if (line > 2)
                    break;
            }
            /*
             * skip the blank char; the cast is required because passing a
             * negative char (a byte with the high bit set) to isspace() is
             * undefined behavior
             */
            else if (isspace((unsigned char) *temp))
            {
                temp++;
            }
            else
            {
                /* we found a non blank char after begin, do further checking */
                if (line == 2)
                    found_non_blank_char = true;
                /* For a transaction statement, all possible tokens after BEGIN are here */
                if (line == 2 &&(pg_strncasecmp(temp, "transaction", strlen("transaction")) == 0 ||
                                  pg_strncasecmp(temp, "work", strlen("work")) == 0 ||
                                  pg_strncasecmp(temp, "isolation", strlen("isolation")) == 0 ||
                                  pg_strncasecmp(temp, "read", strlen("read")) == 0 ||
                                  pg_strncasecmp(temp, "deferrable", strlen("deferrable")) == 0 ||
                                  pg_strncasecmp(temp, "not", strlen("not")) == 0 ||
                                  pg_strncasecmp(temp, ";", strlen(";")) == 0))
                {
                    FREE_POINTER(tempstr);
                    return true;
                }
    
                /* no match: jump to the terminator of the current piece */
                temp += strlen(temp);
            }
        }
    
        FREE_POINTER(tempstr);
    
        /*
         * if all the chars after begin are blank
         *    it is a trans stmt
         * else
         *    it is an anonymous block stmt
         */
        return found_non_blank_char ? false : true;
    }
    
    
    • gram.y
      gram.y使用词法分析出的词(token)去匹配相应的语法规则,如果匹配成功,则生成抽象语法树。
      由于语法较多,这里以select语句解析为例
      相关数据结构
    typedef struct SelectStmt {
        NodeTag type;
    
        /*
         * These fields are used only in "leaf" SelectStmts.
         */
        List *distinctClause;   /* DISTINCT clause */
        IntoClause *intoClause; /* target of SELECT INTO */
        List *targetList;       /* the columns to be selected */
        List *fromClause;       /* FROM clause */
        Node *whereClause;      /* WHERE clause */
        List *groupClause;      /* GROUP BY clause */
        Node *havingClause;     /* HAVING condition */
        List *windowClause;     /* window functions */
        WithClause *withClause; /* WITH clause */
    
        /*
         * In a "leaf" node representing a VALUES list, the above fields are all
         * null, and instead this field is set.  Note that the elements of the
         * sublists are just expressions, without ResTarget decoration. Also note
         * that a list element can be DEFAULT (represented as a SetToDefault
         * node), regardless of the context of the VALUES list. It's up to parse
         * analysis to reject that where not valid.
         */
        List *valuesLists; /* untransformed list of expression lists */
    
        /*
         * These fields are used in both "leaf" SelectStmts and upper-level
         * SelectStmts.
         */
        List *sortClause;    /* sort clause */
        Node *limitOffset;   /* LIMIT offset */
        Node *limitCount;    /* LIMIT row count to return */
        List *lockingClause; /* locking clause */
        HintState *hintState;
    
        /*
         * These fields are used only in upper-level SelectStmts.
         */
        SetOperation op;         /* set operation */
        bool all;                /* ALL specified? */
        struct SelectStmt *larg; /* left child */
        struct SelectStmt *rarg; /* right child */
    
        /*
         * These fields are used by operator "(+)"
         */
        bool hasPlus;
        /* Eventually add fields for CORRESPONDING spec here */
    } SelectStmt;
    

    定义段

    %{
    ... 定义宏,数据结构,函数及include文件
    %}
    
    %pure-parser
    %expect 0
    %name-prefix="base_yy"
    %locations
    
    %parse-param {core_yyscan_t yyscanner}
    %lex-param   {core_yyscan_t yyscanner}
    
    // Redefine the type of yylval: one member per kind of semantic value
    %union
    {
        core_YYSTYPE        core_yystype;
        /* these fields must match core_YYSTYPE: */
        int                 ival;
        char                *str;
        const char          *keyword;
    
        char                chr;
        bool                boolean;
        JoinType            jtype;
        DropBehavior        dbehavior;
        OnCommitAction      oncommit;
        List                *list;
        Node                *node;
        Value               *value;
        ObjectType          objtype;
        TypeName            *typnam;
        FunctionParameter   *fun_param;
        FunctionParameterMode fun_param_mode;
        FuncWithArgs        *funwithargs;
        DefElem             *defelt;
        SortBy              *sortby;
        WindowDef           *windef;
        JoinExpr            *jexpr;
        IndexElem           *ielem;
        Alias               *alias;
        RangeVar            *range;
        IntoClause          *into;
        WithClause          *with;
        A_Indices           *aind;
        ResTarget           *target;
        struct PrivTarget   *privtarget;
        AccessPriv          *accesspriv;
        InsertStmt          *istmt;
        VariableSetStmt     *vsetstmt;
    /* PGXC_BEGIN */
        DistributeBy        *distby;
        PGXCSubCluster      *subclus;
    /* PGXC_END */
        ForeignPartState    *foreignpartby;
        MergeWhenClause     *mergewhen;
        UpsertClause *upsert;
        EncryptionType algtype;
    }
    
    // Give each part of the grammar the type of its associated value
    %type <node>    stmt schema_stmt
            AlterDatabaseStmt AlterDatabaseSetStmt AlterDataSourceStmt 
    ...
    // Declare the tokens recognized by the lexer and used by the grammar, with the type of their value
    %token <str>    IDENT FCONST SCONST BCONST XCONST Op CmpOp COMMENTSTRING
    ...
    // Keywords
    %token <keyword> ABORT_P ABSOLUTE_P ACCESS ACCOUNT ACTION ADD_P ADMIN AFTER
    ...
    
    /* Precedence: lowest to highest */
    %nonassoc   PARTIAL_EMPTY_PREC
    %nonassoc   CLUSTER
    %nonassoc   SET             /* see relation_expr_opt_alias */
    %left       UNION EXCEPT MINUS_P
    %left       INTERSECT
    %left       OR
    %left       AND
    %right      NOT
    %right      '='
    %nonassoc   '<' '>' CmpOp
    %nonassoc   LIKE ILIKE SIMILAR
    %nonassoc   ESCAPE
    %nonassoc   OVERLAPS
    %nonassoc   BETWEEN
    %nonassoc   IN_P
    %left       POSTFIXOP       /* dummy for postfix Op rules */
    /*
     * To support target_el without AS, we must give IDENT an explicit priority
     * between POSTFIXOP and Op.  We can safely assign the same priority to
     * various unreserved keywords as needed to resolve ambiguities (this can't
     * have any bad effects since obviously the keywords will still behave the
     * same as if they weren't keywords).  We need to do this for PARTITION,
     * RANGE, ROWS to support opt_existing_window_name; and for RANGE, ROWS
     * so that they can follow a_expr without creating postfix-operator problems;
     * and for NULL so that it can follow b_expr in ColQualList without creating
     * postfix-operator problems.
     *
     * To support CUBE and ROLLUP in GROUP BY without reserving them, we give them
     * an explicit priority lower than '(', so that a rule with CUBE '(' will shift
     * rather than reducing a conflicting rule that takes CUBE as a function name.
     * Using the same precedence as IDENT seems right for the reasons given above.
     *
     * The frame_bound productions UNBOUNDED PRECEDING and UNBOUNDED FOLLOWING
     * are even messier: since UNBOUNDED is an unreserved keyword (per spec!),
     * there is no principled way to distinguish these from the productions
     * a_expr PRECEDING/FOLLOWING.  We hack this up by giving UNBOUNDED slightly
     * lower precedence than PRECEDING and FOLLOWING.  At present this doesn't
     * appear to cause UNBOUNDED to be treated differently from other unreserved
     * keywords anywhere else in the grammar, but it's definitely risky.  We can
     * blame any funny behavior of UNBOUNDED on the SQL standard, though.
     */
    %nonassoc   UNBOUNDED       /* ideally should have same precedence as IDENT */
    %nonassoc   IDENT NULL_P PARTITION RANGE ROWS PRECEDING FOLLOWING CUBE ROLLUP
    %left       Op OPERATOR     /* multi-character ops and user-defined operators */
    %nonassoc   NOTNULL
    %nonassoc   ISNULL
    %nonassoc   IS              /* sets precedence for IS NULL, etc */
    %left       '+' '-'
    %left       '*' '/' '%'
    %left       '^'
    /* Unary Operators */
    %left       AT              /* sets precedence for AT TIME ZONE */
    %left       COLLATE
    %right      UMINUS
    %left       '[' ']'
    %left       '(' ')'
    %left       TYPECAST
    %left       '.'
    /*
     * These might seem to be low-precedence, but actually they are not part
     * of the arithmetic hierarchy at all in their use as JOIN operators.
     * We make them high-precedence to support their use as function names.
     * They wouldn't be given a precedence at all, were it not that we need
     * left-associativity among the JOIN rules themselves.
     */
    %left       JOIN CROSS LEFT FULL RIGHT INNER_P NATURAL ENCRYPTED
    /* kluge to keep xml_whitespace_option from causing shift/reduce conflicts */
    %right      PRESERVE STRIP_P
    

    规则段

    // Top of the parse: when the statement list has been reduced, store it in the
    // parsetree field of the scanner's extra data
    stmtblock:  stmtmulti
                {
                    pg_yyget_extra(yyscanner)->parsetree = $1;
                }
            ;
    ...
    // SELECT statement grammar.
    // Neither alternative has an action, so bison's default `$$ = $1` applies;
    // %prec UMINUS gives both rules the precedence level of UMINUS.
    SelectStmt: select_no_parens            %prec UMINUS
                | select_with_parens        %prec UMINUS
            ;
    
    // A select wrapped in one or more pairs of parentheses; the value is the inner select ($2)
    select_with_parens:
                '(' select_no_parens ')'                { $$ = $2; }
                | '(' select_with_parens ')'            { $$ = $2; }
            ;
    
    /*
     * A select that is not itself wrapped in parentheses.  Apart from the bare
     * simple_select, each alternative passes its optional parts to
     * insertSelectOptions() in this order: sort list, locking list, limit
     * offset, limit count, WITH clause, scanner.  The value of the rule is the
     * select_clause node.
     * Limit values arrive as a two-element list: element 0 is the offset,
     * element 1 is the count.
     */
    select_no_parens:
                simple_select // plain query
                    {
                        $$ = $1;
                    }
                | select_clause sort_clause // with a sort clause
                    {
                        SelectStmt *stmt = (SelectStmt *) $1;

                        insertSelectOptions(stmt, $2, NIL, NULL, NULL, NULL, yyscanner);
                        $$ = $1;
                    }
                | select_clause opt_sort_clause for_locking_clause opt_select_limit // locking clause first
                    {
                        SelectStmt *stmt = (SelectStmt *) $1;
                        Node *limitOffset = (Node *) list_nth($4, 0);
                        Node *limitCount = (Node *) list_nth($4, 1);

                        insertSelectOptions(stmt, $2, $3, limitOffset, limitCount, NULL, yyscanner);
                        $$ = $1;
                    }
                | select_clause opt_sort_clause select_limit opt_for_locking_clause // limit clause first
                    {
                        SelectStmt *stmt = (SelectStmt *) $1;
                        Node *limitOffset = (Node *) list_nth($3, 0);
                        Node *limitCount = (Node *) list_nth($3, 1);

                        insertSelectOptions(stmt, $2, $4, limitOffset, limitCount, NULL, yyscanner);
                        $$ = $1;
                    }
                | with_clause select_clause // WITH (CTE) only
                    {
                        SelectStmt *stmt = (SelectStmt *) $2;

                        insertSelectOptions(stmt, NULL, NIL, NULL, NULL, $1, yyscanner);
                        $$ = $2;
                    }
                | with_clause select_clause sort_clause // CTE + sort clause
                    {
                        SelectStmt *stmt = (SelectStmt *) $2;

                        insertSelectOptions(stmt, $3, NIL, NULL, NULL, $1, yyscanner);
                        $$ = $2;
                    }
                | with_clause select_clause opt_sort_clause for_locking_clause opt_select_limit // CTE + locking clause
                    {
                        SelectStmt *stmt = (SelectStmt *) $2;
                        Node *limitOffset = (Node *) list_nth($5, 0);
                        Node *limitCount = (Node *) list_nth($5, 1);

                        insertSelectOptions(stmt, $3, $4, limitOffset, limitCount, $1, yyscanner);
                        $$ = $2;
                    }
                | with_clause select_clause opt_sort_clause select_limit opt_for_locking_clause // CTE + limit clause
                    {
                        SelectStmt *stmt = (SelectStmt *) $2;
                        Node *limitOffset = (Node *) list_nth($4, 0);
                        Node *limitCount = (Node *) list_nth($4, 1);

                        insertSelectOptions(stmt, $3, $5, limitOffset, limitCount, $1, yyscanner);
                        $$ = $2;
                    }
            ;
    
    // Operand usable inside a larger select: either a simple_select or a parenthesized select
    select_clause:
                simple_select                           { $$ = $1; }
                | select_with_parens                    { $$ = $1; }
            ;
    
    // Simple query: a single SELECT, a VALUES list, TABLE <name>, or a set
    // operation over two select_clause operands
    simple_select:
                SELECT hint_string opt_distinct target_list
                into_clause from_clause where_clause
                group_clause having_clause window_clause
                    {
                        /* Build a new SelectStmt node out of the individual clauses */
                        SelectStmt *stmt = makeNode(SelectStmt);

                        stmt->distinctClause = $3;
                        stmt->targetList = $4;
                        stmt->intoClause = $5;
                        stmt->fromClause = $6;
                        stmt->whereClause = $7;
                        stmt->groupClause = $8;
                        stmt->havingClause = $9;
                        stmt->windowClause = $10;
                        stmt->hintState = create_hintstate($2);
                        stmt->hasPlus = getOperatorPlusFlag();
                        $$ = (Node *) stmt;
                    }
                | values_clause
                    {
                        $$ = $1;
                    }
                | TABLE relation_expr
                    {
                        /* equivalent to SELECT * FROM relation_expr */
                        ColumnRef *star = makeNode(ColumnRef);
                        ResTarget *target = makeNode(ResTarget);
                        SelectStmt *stmt = makeNode(SelectStmt);

                        star->fields = list_make1(makeNode(A_Star));
                        star->location = -1;

                        target->name = NULL;
                        target->indirection = NIL;
                        target->val = (Node *) star;
                        target->location = -1;

                        stmt->targetList = list_make1(target);
                        stmt->fromClause = list_make1($2);
                        $$ = (Node *) stmt;
                    }
                | select_clause UNION opt_all select_clause // select UNION select
                    {
                        $$ = makeSetOp(SETOP_UNION, $3, $1, $4);
                    }
                | select_clause INTERSECT opt_all select_clause // select INTERSECT select
                    {
                        $$ = makeSetOp(SETOP_INTERSECT, $3, $1, $4);
                    }
                | select_clause EXCEPT opt_all select_clause // select EXCEPT select
                    {
                        $$ = makeSetOp(SETOP_EXCEPT, $3, $1, $4);
                    }
                | select_clause MINUS_P opt_all select_clause // select MINUS select: same SETOP_EXCEPT as EXCEPT
                    {
                        $$ = makeSetOp(SETOP_EXCEPT, $3, $1, $4);
                    }
            ;
    
    hint_string: // optional hint text that follows SELECT; NULL when no COMMENTSTRING token is present
            COMMENTSTRING
                {
                    $$ = $1;
                }
            | /* EMPTY */
                { 
                    $$ = NULL;
                }
            ;
    /*
     * SQL standard WITH clause looks like:
     *
     * WITH [ RECURSIVE ] <query name> [ (<column>,...) ]
     *      AS (query) [ SEARCH or CYCLE clause ]
     *
     * We don't currently support the SEARCH or CYCLE clause.
     *
     * Both alternatives build a WithClause located at the WITH keyword; they
     * differ only in the recursive flag.
     */
    with_clause: // common table expressions (CTE)
            WITH cte_list
                {
                    WithClause *wc = makeNode(WithClause);

                    wc->ctes = $2;
                    wc->recursive = false;
                    wc->location = @1;
                    $$ = wc;
                }
            | WITH RECURSIVE cte_list
                {
                    WithClause *wc = makeNode(WithClause);

                    wc->ctes = $3;
                    wc->recursive = true;
                    wc->location = @1;
                    $$ = wc;
                }
            ;
    
    // Comma-separated list of common_table_expr, built left to right with lappend
    cte_list:
            common_table_expr                       { $$ = list_make1($1); }
            | cte_list ',' common_table_expr        { $$ = lappend($1, $3); }
            ;
    
    /*
     * One CTE: name [ (column aliases) ] AS ( statement ).
     * The node records the name, the alias column list, the inner statement
     * and the position of the name; locator_type starts as LOCATOR_TYPE_NONE.
     */
    common_table_expr:  name opt_name_list AS '(' PreparableStmt ')'
                {
                    CommonTableExpr *cte = makeNode(CommonTableExpr);

                    cte->ctename = $1;
                    cte->aliascolnames = $2;
                    cte->ctequery = $5;
                    cte->location = @1;
                    cte->locator_type = LOCATOR_TYPE_NONE;
                    $$ = (Node *) cte;
                }
            ;
    
    // Optional WITH clause; NULL when omitted
    opt_with_clause:
            with_clause                             { $$ = $1; }
            | /*EMPTY*/                             { $$ = NULL; }
            ;
    
    /*
     * Optional INTO target of a SELECT.  When present, an IntoClause is built
     * around the target relation with every other field set to a fixed
     * default; when absent the value is NULL.
     */
    into_clause:
                INTO OptTempTableName
                    {
                        IntoClause *intoNode = makeNode(IntoClause);

                        intoNode->rel = $2;
                        intoNode->colNames = NIL;
                        intoNode->options = NIL;
                        intoNode->onCommit = ONCOMMIT_NOOP;
                        /* Here $$ is a temp table, so row_compress can be any value. To be safe, REL_CMPRS_PAGE_PLAIN is used. */
                        intoNode->row_compress = REL_CMPRS_PAGE_PLAIN;
                        intoNode->tableSpaceName = NULL;
                        intoNode->skipData = false;
                        intoNode->relkind = INTO_CLAUSE_RELKIND_DEFAULT;
                        $$ = intoNode;
                    }
                | /*EMPTY*/
                    {
                        $$ = NULL;
                    }
            ;
    
    /*
     * Redundancy here is needed to avoid shift/reduce conflicts,
     * since TEMP is not a reserved word.  See also OptTemp.
     *
     * Every alternative yields its qualified_name node after stamping the
     * relpersistence that matches the leading keywords.
     */
    OptTempTableName:
                TEMPORARY opt_table qualified_name
                    {
                        $3->relpersistence = RELPERSISTENCE_TEMP;
                        $$ = $3;
                    }
                | TEMP opt_table qualified_name
                    {
                        $3->relpersistence = RELPERSISTENCE_TEMP;
                        $$ = $3;
                    }
                | LOCAL TEMPORARY opt_table qualified_name
                    {
                        $4->relpersistence = RELPERSISTENCE_TEMP;
                        $$ = $4;
                    }
                | LOCAL TEMP opt_table qualified_name
                    {
                        $4->relpersistence = RELPERSISTENCE_TEMP;
                        $$ = $4;
                    }
                | GLOBAL TEMPORARY opt_table qualified_name
                    {
    #ifdef ENABLE_MULTIPLE_NODES
                        /* multi-node build: warn, then treat it like a plain TEMP table */
                        ereport(WARNING,
                                (errmsg("GLOBAL is deprecated in temporary table creation"),
                                 parser_errposition(@1)));
                        $4->relpersistence = RELPERSISTENCE_TEMP;
    #else
                        $4->relpersistence = RELPERSISTENCE_GLOBAL_TEMP;
    #endif
                        $$ = $4;
                    }
                | GLOBAL TEMP opt_table qualified_name
                    {
    #ifdef ENABLE_MULTIPLE_NODES
                        /* multi-node build: warn, then treat it like a plain TEMP table */
                        ereport(WARNING,
                                (errmsg("GLOBAL is deprecated in temporary table creation"),
                                 parser_errposition(@1)));
                        $4->relpersistence = RELPERSISTENCE_TEMP;
    #else
                        $4->relpersistence = RELPERSISTENCE_GLOBAL_TEMP;
    #endif
                        $$ = $4;
                    }
                | UNLOGGED opt_table qualified_name
                    {
                        $3->relpersistence = RELPERSISTENCE_UNLOGGED;
                        $$ = $3;
                    }
                | TABLE qualified_name
                    {
                        $2->relpersistence = RELPERSISTENCE_PERMANENT;
                        $$ = $2;
                    }
                | qualified_name
                    {
                        $1->relpersistence = RELPERSISTENCE_PERMANENT;
                        $$ = $1;
                    }
            ;
    
    // Optional TABLE keyword; both actions are intentionally empty
    opt_table:  TABLE                                   {}
                | /*EMPTY*/                             {}
            ;
    
    // Quantifier passed to makeSetOp by the set-operation rules: TRUE only for ALL;
    // DISTINCT and an omitted quantifier both give FALSE
    opt_all:    ALL                                     { $$ = TRUE; }
                | DISTINCT                              { $$ = FALSE; }
                | /*EMPTY*/                             { $$ = FALSE; }
            ;
    
    /* We use (NIL) as a placeholder to indicate that all target expressions
     * should be placed in the DISTINCT list during parsetree analysis.
     */
    opt_distinct: // DISTINCT clause: (NIL) for plain DISTINCT, the expr_list for DISTINCT ON, NIL otherwise
                DISTINCT                                { $$ = list_make1(NIL); }
                | DISTINCT ON '(' expr_list ')'         { $$ = $4; }
                | ALL                                   { $$ = NIL; }
                | /*EMPTY*/                             { $$ = NIL; }
            ;
    
    // Optional ORDER BY; NIL when omitted
    opt_sort_clause:
                sort_clause                             { $$ = $1;}
                | /*EMPTY*/                             { $$ = NIL; }
            ;
    
    sort_clause:  // ORDER BY clause; the value is the sortby_list
                ORDER BY sortby_list                    { $$ = $3; }
            ;
    
    // Comma-separated list of sortby items, built left to right with lappend
    sortby_list:
                sortby                                  { $$ = list_make1($1); }
                | sortby_list ',' sortby                { $$ = lappend($1, $3); }
            ;
    
    /*
     * One ORDER BY item, in one of three spellings:
     *   expr USING operator [nulls order]   - location is that of the operator
     *   expr [ASC|DESC] [nulls order]       - no operator, location -1
     *   NLSSORT(expr, 'method') ...         - the expression is wrapped in a
     *       call to convert_to_nocase(expr, 'gbk') when checkNlssortArgs()
     *       accepts the method string; otherwise an error is reported.
     */
    sortby:     a_expr USING qual_all_Op opt_nulls_order
                    {
                        SortBy *item = makeNode(SortBy);

                        item->node = $1;
                        item->sortby_dir = SORTBY_USING;
                        item->sortby_nulls = (SortByNulls)$4;
                        item->useOp = $3;
                        item->location = @3;
                        $$ = item;
                    }
                | a_expr opt_asc_desc opt_nulls_order
                    {
                        SortBy *item = makeNode(SortBy);

                        item->node = $1;
                        item->sortby_dir = (SortByDir)$2;
                        item->sortby_nulls = (SortByNulls)$3;
                        item->useOp = NIL;
                        item->location = -1;      /* no operator */
                        $$ = item;
                    }
                | NLSSORT '(' a_expr ',' Sconst ')' opt_asc_desc opt_nulls_order
                    {
                        if (!checkNlssortArgs($5))
                        {
                            /* unsupported sort method */
                            $$ = NULL;
                            ereport(ERROR,(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                        errmsg("Sort method %s  is not supported!",$5)));
                        }
                        else
                        {
                            /* sort on convert_to_nocase(<expr>, 'gbk') instead of the bare expression */
                            FuncCall *call = makeNode(FuncCall);
                            SortBy *item = NULL;

                            call->funcname = SystemFuncName("convert_to_nocase");
                            call->args = list_make2($3, makeStringConst("gbk", -1));
                            call->agg_order = NIL;
                            call->agg_star = FALSE;
                            call->agg_distinct = FALSE;
                            call->func_variadic = FALSE;
                            call->over = NULL;
                            call->location = @1;
                            call->call_func = false;

                            item = makeNode(SortBy);
                            item->node = (Node*)call;
                            item->sortby_dir = (SortByDir)$7;
                            item->sortby_nulls = (SortByNulls)$8;
                            item->useOp = NIL;
                            item->location = @1;
                            $$ = item;
                        }
                    }
            ;
    
    
    select_limit: // LIMIT/OFFSET clause; every alternative yields the two-element list (offset, limit), NULL marking a missing part
                limit_clause offset_clause              { $$ = list_make2($2, $1); }
                | offset_clause limit_clause                { $$ = list_make2($1, $2); }
                | limit_clause                      { $$ = list_make2(NULL, $1); }
                | limit_offcnt_clause                   { $$ = $1; }
                | offset_clause                     { $$ = list_make2($1, NULL); }
            ;
    
    // Optional select_limit; when omitted the list is still two elements long, both NULL
    opt_select_limit:
                select_limit                        { $$ = $1; }
                | /* EMPTY */                       { $$ = list_make2(NULL,NULL); }
            ;
    
    /*
     * Optional LIMIT a_expr, returned as a two-element list laid out like
     * select_limit: the first (offset) slot is always NULL, the second holds the
     * limit expression or NULL when the clause is omitted.
     * NOTE(review): presumably used by DELETE, going by the name; the rule that
     * references it is outside this excerpt.
     */
    opt_delete_limit:
                LIMIT a_expr                        { $$ = list_make2(NULL, $2); }
                | /* EMPTY */                       { $$ = list_make2(NULL, NULL); }
            ;
    
    
    // Row-count part of a limit: LIMIT <value>, or the FETCH FIRST/NEXT ... ROW/ROWS ONLY spelling
    limit_clause:
                LIMIT select_limit_value
                    { $$ = $2; }
                /* SQL:2008 syntax */
                | FETCH first_or_next opt_select_fetch_first_value row_or_rows ONLY
                    { $$ = $3; }
            ;
    
    // LIMIT <offset> , <count> form; yields the list (offset, count) directly
    limit_offcnt_clause:
                LIMIT select_offset_value ',' select_limit_value
                    {
                        $$ = list_make2($2, $4);
                    }
            ;
    
    // OFFSET part of a limit; the value is the offset expression
    offset_clause:
                OFFSET select_offset_value
                    { $$ = $2; }
                /* SQL:2008 syntax */
                | OFFSET select_offset_value2 row_or_rows
                    { $$ = $2; }
            ;
    
    // Value after LIMIT: any a_expr, or the keyword ALL
    select_limit_value:
                a_expr                                  { $$ = $1; }
                | ALL
                    {
                        /* LIMIT ALL is represented as a NULL constant */
                        $$ = makeNullAConst(@1);
                    }
            ;
    
    // Value after OFFSET (and the offset part of LIMIT offset, count): any a_expr
    select_offset_value:
                a_expr                                  { $$ = $1; }
            ;
    
    /*
     * Allowing full expressions without parentheses causes various parsing
     * problems with the trailing ROW/ROWS key words.  SQL only calls for
     * constants, so we allow the rest only with parentheses.  If omitted,
     * default to 1.
     *
     * A signed integer literal becomes an integer constant at its own
     * position; the default constant 1 is created with location -1.
     */
    opt_select_fetch_first_value:
                SignedIconst                        { $$ = makeIntConst($1, @1); }
                | '(' a_expr ')'                    { $$ = $2; }
                | /*EMPTY*/                         { $$ = makeIntConst(1, -1); }
            ;
    
    /*
     * Again, the trailing ROW/ROWS in this case prevent the full expression
     * syntax.  c_expr is the best we can do.
     *
     * Used by the OFFSET <value> ROW/ROWS alternative of offset_clause.
     */
    select_offset_value2:
                c_expr                                  { $$ = $1; }
            ;
    
    /* noise words: ROW and ROWS both yield 0 */
    row_or_rows: ROW                                    { $$ = 0; }
                | ROWS                                  { $$ = 0; }
            ;
    
    /* noise words of FETCH FIRST/NEXT: both yield 0 */
    first_or_next: FIRST_P                              { $$ = 0; }
                | NEXT                                  { $$ = 0; }
            ;
    
    /*
     * This syntax for group_clause tries to follow the spec quite closely.
     * However, the spec allows only column references, not expressions,
     * which introduces an ambiguity between implicit row constructors
     * (a,b) and lists of column references.
     *
     * We handle this by using the a_expr production for what the spec calls
     * <ordinary grouping set>, which in the spec represents either one column
     * reference or a parenthesized list of column references. Then, we check the
     * top node of the a_expr to see if it's an implicit RowExpr, and if so, just
     * grab and use the list, discarding the node. (this is done in parse analysis,
     * not here)
     *
     * (we abuse the row_format field of RowExpr to distinguish implicit and
     * explicit row constructors; it's debatable if anyone sanely wants to use them
     * in a group clause, but if they have a reason to, we make it possible.)
     *
     * Each item in the group_clause list is either an expression tree or a
     * GroupingSet node of some type.
     */
    
    group_clause: // GROUP BY clause; NIL when omitted
                GROUP_P BY group_by_list                { $$ = $3; }
                | /*EMPTY*/                             { $$ = NIL; }
            ;
    // Comma-separated list of group_by_item, built left to right with lappend
    group_by_list:
                group_by_item                           { $$ = list_make1($1); }
                | group_by_list ',' group_by_item       { $$ = lappend($1,$3); }
            ;
    
    // One GROUP BY entry: a plain expression or one of the grouping-set forms; the value passes through unchanged
    group_by_item:
                a_expr                                  { $$ = $1; }
                | empty_grouping_set                    { $$ = $1; }
                | cube_clause                           { $$ = $1; }
                | rollup_clause                         { $$ = $1; }
                | grouping_sets_clause                  { $$ = $1; }
            ;
    
    // "()" : a GROUPING_SET_EMPTY grouping set with no content, located at the opening parenthesis
    empty_grouping_set:
                '(' ')'
                    {
                        $$ = (Node *) makeGroupingSet(GROUPING_SET_EMPTY, NIL, @1);
                    }
            ;
    /*
     * These hacks rely on setting precedence of CUBE and ROLLUP below that of '(',
     * so that they shift in these rules rather than reducing the conflicting
     * unreserved_keyword rule.
     */
    
    // ROLLUP ( expr_list ): a GROUPING_SET_ROLLUP grouping set located at the ROLLUP keyword
    rollup_clause:
                ROLLUP '(' expr_list ')'
                    {
                        $$ = (Node *) makeGroupingSet(GROUPING_SET_ROLLUP, $3, @1);
                    }
            ;
    
    // CUBE ( expr_list ): a GROUPING_SET_CUBE grouping set located at the CUBE keyword
    cube_clause:
                CUBE '(' expr_list ')'
                    {
                        $$ = (Node *) makeGroupingSet(GROUPING_SET_CUBE, $3, @1);
                    }
            ;
    
    // GROUPING SETS ( group_by_list ): a GROUPING_SET_SETS grouping set; the content is
    // itself a group_by_list, so grouping-set forms can nest
    grouping_sets_clause:
                GROUPING_P SETS '(' group_by_list ')'
                    {
                        $$ = (Node *) makeGroupingSet(GROUPING_SET_SETS, $4, @1);
                    }
            ;
    
    
    // Optional HAVING condition; NULL when omitted
    having_clause:
                HAVING a_expr                           { $$ = $2; }
                | /*EMPTY*/                             { $$ = NULL; }
            ;
    
    for_locking_clause: // locking clause; FOR READ ONLY yields NIL, i.e. no locking items
                for_locking_items                       { $$ = $1; }
                | FOR READ ONLY                         { $$ = NIL; }
            ;
    
    // Optional locking clause; NIL when omitted
    opt_for_locking_clause:
                for_locking_clause                      { $$ = $1; }
                | /* EMPTY */                           { $$ = NIL; }
            ;
    
    // One or more locking items written back to back (no separator), collected with lappend
    for_locking_items:
                for_locking_item                        { $$ = list_make1($1); }
                | for_locking_items for_locking_item    { $$ = lappend($1, $2); }
            ;
    
    /*
     * A single FOR UPDATE / FOR SHARE item.  Both alternatives build a
     * LockingClause from the locked relation list and the nowait flag; they
     * differ only in forUpdate (TRUE for UPDATE, FALSE for SHARE).
     */
    for_locking_item:
                FOR UPDATE locked_rels_list opt_nowait
                    {
                        LockingClause *lock = makeNode(LockingClause);

                        lock->lockedRels = $3;
                        lock->forUpdate = TRUE;
                        lock->noWait = $4;
                        $$ = (Node *) lock;
                    }
                | FOR SHARE locked_rels_list opt_nowait
                    {
                        LockingClause *lock = makeNode(LockingClause);

                        lock->lockedRels = $3;
                        lock->forUpdate = FALSE;
                        lock->noWait = $4;
                        $$ = (Node *) lock;
                    }
            ;
    
    // Optional "OF name, ..." list of a locking item; NIL when omitted
    locked_rels_list:
                OF qualified_name_list                  { $$ = $2; }
                | /* EMPTY */                           { $$ = NIL; }
            ;
    
    
    /*
     * VALUES clause.  The first row creates a SelectStmt whose valuesLists
     * holds that row; each further ", row" is appended to the valuesLists of
     * the same statement.
     */
    values_clause:
                VALUES ctext_row
                    {
                        SelectStmt *stmt = makeNode(SelectStmt);

                        stmt->valuesLists = list_make1($2);
                        $$ = (Node *) stmt;
                    }
                | values_clause ',' ctext_row
                    {
                        SelectStmt *stmt = (SelectStmt *) $1;

                        stmt->valuesLists = lappend(stmt->valuesLists, $3);
                        $$ = (Node *) stmt;
                    }
            ;
    
    
    /*****************************************************************************
     *
     *  clauses common to all Optimizable Stmts:
     *      from_clause     - allow list of both JOIN expressions and table names
     *      where_clause    - qualifications for joins or restrictions
     *
     *****************************************************************************/
    
    from_clause: // FROM clause; the value is the from_list, or NIL when omitted
                FROM from_list                          { $$ = $2; }
                | /*EMPTY*/                             { $$ = NIL; }
            ;
    
    // Comma-separated list of table_ref, built left to right with lappend
    from_list:
                table_ref                               { $$ = list_make1($1); }
                | from_list ',' table_ref               { $$ = lappend($1, $3); }
            ;
    
    /*
     * table_ref is where an alias clause can be attached.  Note we cannot make
     * alias_clause have an empty production because that causes parse conflicts
     * between table_ref := '(' joined_table ')' alias_clause
     * and joined_table := '(' joined_table ')'.  So, we must have the
     * redundant-looking productions here instead.
     */
    // 访问表
    table_ref:  relation_expr
                    {
                        $$ = (Node *) $1;
                    }
                | relation_expr alias_clause
                    {
                        $1->alias = $2;
                        $$ = (Node *) $1;
                    }
                | relation_expr opt_alias_clause tablesample_clause
                    {
                        RangeTableSample *n = (RangeTableSample *) $3;
                        $1->alias = $2;
                        /* relation_expr goes inside the RangeTableSample node */
                        n->relation = (Node *) $1;
                        $$ = (Node *) n;
                    }
    
                | relation_expr PARTITION '(' name ')'
                    {
                        $1->partitionname = $4;
                        $1->ispartition = true;
                        $$ = (Node *)$1;
                    }
                | relation_expr BUCKETS '(' bucket_list ')'
                    {
                        $1->buckets = $4;
                        $1->isbucket = true;
                        $$ = (Node *)$1;
                    }
                | relation_expr PARTITION_FOR '(' maxValueList ')'
                    {
                        $1->partitionKeyValuesList = $4;
                        $1->ispartition = true;
                        $$ = (Node *)$1;
                    }
    
                | relation_expr PARTITION '(' name ')' alias_clause
                    {
                        $1->partitionname = $4;
                        $1->alias = $6;
                        $1->ispartition = true;
                        $$ = (Node *)$1;
                    }
    
                | relation_expr PARTITION_FOR '(' maxValueList ')' alias_clause
                    {
                        $1->partitionKeyValuesList = $4;
                        $1->alias = $6;
                        $1->ispartition = true;
                        $$ = (Node *)$1;
                    }
    
                | func_table
                    {
                        RangeFunction *n = makeNode(RangeFunction);
                        n->funccallnode = $1;
                        n->coldeflist = NIL;
                        $$ = (Node *) n;
                    }
                | func_table alias_clause
                    {
                        RangeFunction *n = makeNode(RangeFunction);
                        n->funccallnode = $1;
                        n->alias = $2;
                        n->coldeflist = NIL;
                        $$ = (Node *) n;
                    }
                | func_table AS '(' TableFuncElementList ')'
                    {
                        RangeFunction *n = makeNode(RangeFunction);
                        n->funccallnode = $1;
                        n->coldeflist = $4;
                        $$ = (Node *) n;
                    }
                | func_table AS ColId '(' TableFuncElementList ')'
                    {
                        RangeFunction *n = makeNode(RangeFunction);
                        Alias *a = makeNode(Alias);
                        n->funccallnode = $1;
                        a->aliasname = $3;
                        n->alias = a;
                        n->coldeflist = $5;
                        $$ = (Node *) n;
                    }
                | func_table ColId '(' TableFuncElementList ')'
                    {
                        RangeFunction *n = makeNode(RangeFunction);
                        Alias *a = makeNode(Alias);
                        n->funccallnode = $1;
                        a->aliasname = $2;
                        n->alias = a;
                        n->coldeflist = $4;
                        $$ = (Node *) n;
                    }
                | select_with_parens
                    {
                        /*
                         * The SQL spec does not permit a subselect
                         * (<derived_table>) without an alias clause,
                         * so we don't either.  This avoids the problem
                         * of needing to invent a unique refname for it.
                         * That could be surmounted if there's sufficient
                         * popular demand, but for now let's just implement
                         * the spec and see if anyone complains.
                         * However, it does seem like a good idea to emit
                         * an error message that's better than "syntax error".
                         */
                        /* add select_with_parens whthout alias_clause adapt A db for procedure dubug */
                        $$ = NULL;
                        if (IsA($1, SelectStmt) &&
                            ((SelectStmt *) $1)->valuesLists)
                            ereport(ERROR,
                                    (errcode(ERRCODE_SYNTAX_ERROR),
                                     errmsg("VALUES in FROM must have an alias"),
                                     errhint("For example, FROM (VALUES ...) [AS] foo."),
                                     parser_errposition(@1)));
                        else
                        {
                            /*
                            * add a anonymous table name for this subquery
                            * simulate A db to support no alias for subquery,
                            * give the suqquery a default name "anonymous_table"
                            */
                            RangeSubselect *n = makeNode(RangeSubselect);
                            Alias *a = makeNode(Alias);
                            n->subquery = $1;
                            n->alias = NULL;
                            a->aliasname = pstrdup("__unnamed_subquery__");
                            n->alias = a;
                            $$ = (Node *) n;
                        }
                    }
                | select_with_parens alias_clause
                    {
                        RangeSubselect *n = makeNode(RangeSubselect);
                        n->subquery = $1;
                        n->alias = $2;
                        $$ = (Node *) n;
                    }
                | joined_table
                    {
                        $$ = (Node *) $1;
                    }
                | '(' joined_table ')' alias_clause
                    {
                        $2->alias = $4;
                        $$ = (Node *) $2;
                    }
            ;
    
    
    /*
     * It may seem silly to separate joined_table from table_ref, but there is
     * method in SQL92's madness: if you don't do it this way you get reduce-
     * reduce conflicts, because it's not clear to the parser generator whether
     * to expect alias_clause after ')' or not.  For the same reason we must
     * treat 'JOIN' and 'join_type JOIN' separately, rather than allowing
     * join_type to expand to empty; if we try it, the parser generator can't
     * figure out when to reduce an empty join_type right after table_ref.
     *
     * Note that a CROSS JOIN is the same as an unqualified
     * INNER JOIN, and an INNER JOIN/ON has the same shape
     * but a qualification expression to limit membership.
     * A NATURAL JOIN implicitly matches column names between
     * tables and the shape is determined by which columns are
     * in common. We'll collect columns during the later transformations.
     */
    
    joined_table: /* join expressions */
                '(' joined_table ')'
                    {
                        /* parentheses only group; pass the inner join through */
                        $$ = $2;
                    }
                | table_ref CROSS JOIN table_ref
                    {
                        /* a CROSS JOIN is built as an inner join with no qualification */
                        JoinExpr *join = makeNode(JoinExpr);
                        join->larg = $1;
                        join->rarg = $4;
                        join->jointype = JOIN_INNER;
                        join->isNatural = FALSE;
                        join->quals = NULL;
                        join->usingClause = NIL;
                        $$ = join;
                    }
                | table_ref join_type JOIN table_ref join_qual
                    {
                        JoinExpr *join = makeNode(JoinExpr);
                        join->larg = $1;
                        join->rarg = $4;
                        join->jointype = $2;
                        join->isNatural = FALSE;
                        /* join_qual yields a List for USING, anything else is ON */
                        if ($5 == NULL || !IsA($5, List))
                            join->quals = $5; /* ON clause */
                        else
                            join->usingClause = (List *) $5; /* USING clause */
                        $$ = join;
                    }
                | table_ref JOIN table_ref join_qual
                    {
                        /*
                         * Plain JOIN needs its own alternative: join_type
                         * cannot be allowed to reduce to empty.
                         */
                        JoinExpr *join = makeNode(JoinExpr);
                        join->larg = $1;
                        join->rarg = $3;
                        join->jointype = JOIN_INNER;
                        join->isNatural = FALSE;
                        /* join_qual yields a List for USING, anything else is ON */
                        if ($4 == NULL || !IsA($4, List))
                            join->quals = $4; /* ON clause */
                        else
                            join->usingClause = (List *) $4; /* USING clause */
                        $$ = join;
                    }
                | table_ref NATURAL join_type JOIN table_ref
                    {
                        JoinExpr *join = makeNode(JoinExpr);
                        join->larg = $1;
                        join->rarg = $5;
                        join->jointype = $3;
                        join->isNatural = TRUE;
                        join->quals = NULL; /* filled in later */
                        join->usingClause = NIL; /* matching columns are worked out later */
                        $$ = join;
                    }
                | table_ref NATURAL JOIN table_ref
                    {
                        /*
                         * Plain NATURAL JOIN needs its own alternative:
                         * join_type cannot be allowed to reduce to empty.
                         */
                        JoinExpr *join = makeNode(JoinExpr);
                        join->larg = $1;
                        join->rarg = $4;
                        join->jointype = JOIN_INNER;
                        join->isNatural = TRUE;
                        join->quals = NULL; /* filled in later */
                        join->usingClause = NIL; /* matching columns are worked out later */
                        $$ = join;
                    }
            ;
    
    alias_clause: /* alias name, optionally followed by a column-name list */
                AS ColId '(' name_list ')'
                    {
                        Alias *a = makeNode(Alias);
                        a->aliasname = $2;
                        a->colnames = $4;
                        $$ = a;
                    }
                | AS ColId
                    {
                        Alias *a = makeNode(Alias);
                        a->aliasname = $2;
                        $$ = a;
                    }
                | ColId '(' name_list ')'
                    {
                        /* same as the first form, with the AS keyword omitted */
                        Alias *a = makeNode(Alias);
                        a->aliasname = $1;
                        a->colnames = $3;
                        $$ = a;
                    }
                | ColId
                    {
                        /* bare alias name, no AS and no column list */
                        Alias *a = makeNode(Alias);
                        a->aliasname = $1;
                        $$ = a;
                    }
            ;
    
    /* An alias_clause that may be absent; yields NULL when it is. */
    opt_alias_clause:
                alias_clause
                    {
                        $$ = $1;
                    }
                | /*EMPTY*/
                    {
                        $$ = NULL;
                    }
            ;
    
    /* Maps the join keyword(s) onto the matching JOIN_* value. */
    join_type:
                FULL join_outer
                    {
                        $$ = JOIN_FULL;
                    }
                | LEFT join_outer
                    {
                        $$ = JOIN_LEFT;
                    }
                | RIGHT join_outer
                    {
                        $$ = JOIN_RIGHT;
                    }
                | INNER_P
                    {
                        $$ = JOIN_INNER;
                    }
            ;
    
    /*
     * The OUTER keyword is optional noise: both alternatives produce NULL,
     * so its presence or absence is not recorded.
     */
    join_outer:
                OUTER_P
                    {
                        $$ = NULL;
                    }
                | /*EMPTY*/
                    {
                        $$ = NULL;
                    }
            ;
    
    /*
     * JOIN qualification clauses.
     *
     * Two forms are accepted:
     *  USING ( column list )  - only unqualified column names, which must
     *                           match between the tables.
     *  ON expr                - a more general qualification expression.
     *
     * USING is returned as a List node, while an ON expression is never a
     * List; callers rely on that to tell the two forms apart.
     */

    join_qual:
                USING '(' name_list ')'
                    {
                        $$ = (Node *) $3;
                    }
                | ON a_expr
                    {
                        $$ = $2;
                    }
            ;
    
    
    /*
     * A relation reference with its inheritance option.  Every alternative
     * reuses the node produced by qualified_name, sets inhOpt and clears
     * the alias.
     */
    relation_expr:
                qualified_name
                    {
                        /* default inheritance */
                        $1->inhOpt = INH_DEFAULT;
                        $1->alias = NULL;
                        $$ = $1;
                    }
                | qualified_name '*'
                    {
                        /* inheritance query */
                        $1->inhOpt = INH_YES;
                        $1->alias = NULL;
                        $$ = $1;
                    }
                | ONLY qualified_name
                    {
                        /* no inheritance */
                        $2->inhOpt = INH_NO;
                        $2->alias = NULL;
                        $$ = $2;
                    }
                | ONLY '(' qualified_name ')'
                    {
                        /* no inheritance, SQL99-style syntax */
                        $3->inhOpt = INH_NO;
                        $3->alias = NULL;
                        $$ = $3;
                    }
            ;
    
    
    /* Comma-separated relation_expr items, collected into a list in order. */
    relation_expr_list:
                relation_expr
                    {
                        $$ = list_make1($1);
                    }
                | relation_expr_list ',' relation_expr
                    {
                        $$ = lappend($1, $3);
                    }
            ;
    
    ...
    where_clause: /* optional WHERE clause; NULL when absent */
                WHERE a_expr
                    {
                        $$ = $2;
                    }
                | /*EMPTY*/
                    {
                        $$ = NULL;
                    }
            ;
    ...
    

    程序段

    ...
    /*
     * parser_init()
     *
     * Prepare the parser state in *yyext for parsing one query string:
     * the parenthesis depth goes back to zero and both lists start empty.
     */
    void
    parser_init(base_yy_extra_type *yyext)
    {
        yyext->core_yy_extra.paren_depth = 0;
        yyext->core_yy_extra.query_string_locationlist = NIL;

        /* start from an empty tree in case the grammar forgets to set it */
        yyext->parsetree = NIL;
    }
    ...
    
    • 词法语法解析流程
      这里用以下查询语句进行分析
    postgres=# select * from a where id < 100 order by id;
    

    语句执行流程图如下:


    parse
    1. 词法语法解析入口函数raw_parser,调用base_yyparse开始解析
    2. 首先词法解析到SELECT关键字
    /* Builds a SelectStmt from the clauses of a plain SELECT. */
    simple_select:
                SELECT hint_string opt_distinct target_list
                into_clause from_clause where_clause
                group_clause having_clause window_clause
                    {
                        SelectStmt *stmt = makeNode(SelectStmt);

                        /* copy each clause into its SelectStmt field */
                        stmt->distinctClause = $3;
                        stmt->targetList = $4;
                        stmt->intoClause = $5;
                        stmt->fromClause = $6;
                        stmt->whereClause = $7;
                        stmt->groupClause = $8;
                        stmt->havingClause = $9;
                        stmt->windowClause = $10;

                        /* the hint state is derived from hint_string ($2) */
                        stmt->hintState = create_hintstate($2);
                        stmt->hasPlus = getOperatorPlusFlag();

                        $$ = (Node *) stmt;
                    }
    

    (1) 由SELECT关键字匹配到simple_select语法规则
    (2) hint_string, opt_distinct 返回空
    (3) target_list匹配到 '*' 字符,构建ColumnRef,加入到list
    (4) into_clause 返回空
    (5) 匹配FROM关键字,匹配表名,构建RangeVar,加入到list
    (6) 匹配WHERE关键字,匹配字段名,构建ColumnRef,匹配int常量,匹配<表达式,构建A_Expr
    (7) group_clause,having_clause,window_clause 返回空
    (8) 最后构建 SelectStmt

    3. 匹配order by
    /* ORDER BY clause: yields the list built by sortby_list. */
    sort_clause:
                ORDER BY sortby_list
                    {
                        $$ = $3;
                    }
            ;
    /* Comma-separated sortby items, collected into a list in order. */
    sortby_list:
                sortby
                    {
                        $$ = list_make1($1);
                    }
                | sortby_list ',' sortby
                    {
                        $$ = lappend($1, $3);
                    }
            ;
    
    /* One sort key: an expression plus ordering and NULLS placement. */
    sortby:
                a_expr USING qual_all_Op opt_nulls_order
                    {
                        /* explicit sort operator: remember it and where it appeared */
                        SortBy *key = makeNode(SortBy);
                        key->node = $1;
                        key->useOp = $3;
                        key->location = @3;
                        key->sortby_dir = SORTBY_USING;
                        key->sortby_nulls = (SortByNulls)$4;
                        $$ = key;
                    }
                | a_expr opt_asc_desc opt_nulls_order
                    {
                        SortBy *key = makeNode(SortBy);
                        key->node = $1;
                        key->useOp = NIL;
                        key->location = -1;      /* no operator */
                        key->sortby_dir = (SortByDir)$2;
                        key->sortby_nulls = (SortByNulls)$3;
                        $$ = key;
                    }
    ...
    select_no_parens:
                simple_select
                    {
                        $$ = $1;
                    }
                | select_clause sort_clause
                    {
                        /*
                         * Hand the sort clause to insertSelectOptions() for
                         * the statement in $1; the other option arguments
                         * are passed as NIL/NULL.
                         */
                        SelectStmt *stmt = (SelectStmt *) $1;
                        insertSelectOptions(stmt, $2, NIL,
                                            NULL, NULL, NULL,
                                            yyscanner);
                        $$ = $1;
                    }
    

    (1) 读取到ORDER, BY关键字,匹配sort_clause语法规则
    (2) 匹配字段名,构建ColumnRef,构建SortBy节点
    (3) 匹配 select_clause sort_clause 规则,将sort_clause中构建的SortBy节点加入到上一步的SelectStmt中

    4. 返回抽象语法树
    /* Stores the finished statement list in the parser's extra data. */
    stmtblock:
                stmtmulti
                    {
                        pg_yyget_extra(yyscanner)->parsetree = $1;
                    }
            ;
    
    /*
     * Semicolon-separated statements.  The NULL checks exist to discard
     * "empty" statements; a statement that is itself a List is spliced
     * into the result instead of being added as a single element.
     */
    stmtmulti:  stmtmulti ';' stmt
                    {
                        if ($3 == NULL)
                            $$ = $1;
                        else if (IsA($3, List))
                            $$ = list_concat($1, (List*)$3);
                        else
                            $$ = lappend($1, $3);
                    }
                | stmt
                    {
                        if ($1 == NULL)
                            $$ = NIL;
                        else if (IsA($1, List))
                            $$ = (List*)$1;
                        else
                            $$ = list_make1($1);
                    }
            ;
    

    (1) 将上述SelectStmt加入list,赋值给yyextra.parsetree
    (2) raw_parser函数将parsetree返回给上层调用函数

    相关文章

      网友评论

          本文标题:词法语法解析

          本文链接:https://www.haomeiwen.com/subject/ixyjmltx.html