美文网首页
Regex实践

Regex实践

作者: MrDecoder | 来源:发表于2023-11-01 16:31 被阅读0次

    [TOC]

    #1. 电话号码

    文本:

    • J. Doe: 248-555-1234
    • B. Smith: (313) 555-1234

    Pattern:

    • \(?[2-9]\d\d\)?[ -]?[2-9]\d\d-\d{4}
    #include <Windows.h>
    #include <regex>
    #include <iostream>
    #include <string>
    
    // Demo: find every phone number in a sample string and print each match
    // together with the text before it (prefix) and after it (suffix).
    int main()
    {
        const std::string text = "J. Doe: 248-555-1234  B. Smith: (313) 555-1234";
        // Optional "(", 3-digit area code, optional ")", optional space or hyphen,
        // 3-digit exchange, "-", then the final 4 digits.
        const std::regex expression(R"(\(?[2-9]\d\d\)?[ -]?[2-9]\d\d-\d{4})");

        std::smatch matches;
        // Each pass resumes searching right after the end of the previous match.
        for (auto cursor = text.cbegin();
             std::regex_search(cursor, text.cend(), matches, expression);
             cursor = matches.suffix().first)
        {
            std::cout << "matches for '" << text << "'\n"
                      << "Prefix: '" << matches.prefix() << "'\n";

            // Group 0 is the whole match; further entries are capture groups.
            std::size_t index = 0;
            for (const auto& group : matches)
            {
                std::cout << index++ << ": " << group << '\n';
            }

            std::cout << "Suffix: '" << matches.suffix() << "'\n\n";
        }
        return 0;
    }
    
    // matches for 'J. Doe: 248-555-1234  B. Smith: (313) 555-1234'
    // Prefix: 'J. Doe: '
    // 0: 248-555-1234
    // Suffix: '  B. Smith: (313) 555-1234'
    
    // matches for 'J. Doe: 248-555-1234  B. Smith: (313) 555-1234'
    // Prefix: '  B. Smith: '
    // 0: (313) 555-1234
    // Suffix: ''
    

    \(?匹配一个可选的左括号。接下来的[2-9]\d\d负责匹配一个3位数的区号(第1位数字只能是2到9)。\)?匹配一个可选的右括号。[ -]?匹配一个空格或连字符——这个字符也是可选的。[2-9]\d\d-\d{4}匹配电话号码的剩余部分:一个3位数的局号(第1位数字只能是2到9)、一个连字符和最后4位数字。

    #2. 邮政编码

    文本:

    • 999 1st Avenue, Bigtown, NY, 11222
    • 123 High Street, Any City, MI 48034-1234

    Pattern:

    • \d{5}(-\d{4})?
    #include <Windows.h>
    #include <regex>
    #include <iostream>
    #include <string>
    
    // Demo: find every postal code (5 digits with an optional "-dddd" extension)
    // in a sample string and print each match with its capture groups.
    int main()
    {
        const std::string text = "999 1st Avenue, Bigtown, NY, 11222 123 High Street, Any City, MI 48034-1234";
        const std::regex expression(R"(\d{5}(-\d{4})?)");

        // A default-constructed regex iterator acts as the end-of-sequence marker.
        const std::sregex_iterator noMoreMatches;
        for (std::sregex_iterator hit(text.cbegin(), text.cend(), expression);
             hit != noMoreMatches;
             ++hit)
        {
            // For every match after the first, prefix() begins where the previous
            // match ended, so the printed prefix is only the text in between.
            const std::smatch& matches = *hit;

            std::cout << "matches for '" << text << "'\n";
            std::cout << "Prefix: '" << matches.prefix() << "'\n";
            for (std::size_t group = 0; group != matches.size(); ++group)
            {
                std::cout << group << ": " << matches[group] << '\n';
            }
            std::cout << "Suffix: '" << matches.suffix() << "'\n\n";
        }

        return 0;
    }
    
    // matches for '999 1st Avenue, Bigtown, NY, 11222 123 High Street, Any City, MI 48034-1234'
    // Prefix: '999 1st Avenue, Bigtown, NY, '
    // 0: 11222
    // 1:
    // Suffix: ' 123 High Street, Any City, MI 48034-1234'
    
    // matches for '999 1st Avenue, Bigtown, NY, 11222 123 High Street, Any City, MI 48034-1234'
    // Prefix: ' 123 High Street, Any City, MI '
    // 0: 48034-1234
    // 1: -1234
    // Suffix: ''
    

    \d{5}匹配任意5位数字,(-\d{4})?匹配一个连字符和后4位数字。因为后4位数字是可选的,所以把-\d{4}用括号括起来构成一个子表达式,再在后面加上?来表明这个子表达式最多只允许出现一次。

    #3. IP地址

    文本:

    • localhost is 127.0.0.1.

    Pattern:

    • (((\d{1,2})|(1\d{2})|(2[0-4]\d)|(25[0-5]))\.){3}((\d{1,2})|(1\d{2})|(2[0-4]\d)|(25[0-5]))
    #include <Windows.h>
    #include <regex>
    #include <iostream>
    #include <string>
    
    // Demo: locate a dotted-quad IP address in a sample string and print the
    // full match plus every nested capture group.
    int main()
    {
        const std::string text = "localhost is 127.0.0.1.";
        // Three repetitions of "<octet>." followed by one final "<octet>", where
        // an octet is 1-2 digits, 1dd, 20d-24d, or 250-255.
        const std::regex expression(
            R"re((((\d{1,2})|(1\d{2})|(2[0-4]\d)|(25[0-5]))\.){3}((\d{1,2})|(1\d{2})|(2[0-4]\d)|(25[0-5])))re");

        // Prints one match: header, prefix, every group by index, then suffix.
        const auto report = [&text](const std::smatch& found)
        {
            std::cout << "matches for '" << text << "'\n";
            std::cout << "Prefix: '" << found.prefix() << "'\n";
            for (std::size_t n = 0; n < found.size(); ++n)
            {
                std::cout << n << ": " << found[n] << '\n';
            }
            std::cout << "Suffix: '" << found.suffix() << "'\n\n";
        };

        std::smatch matches;
        auto resumeAt = text.cbegin();
        while (std::regex_search(resumeAt, text.cend(), matches, expression))
        {
            report(matches);
            // Continue scanning immediately after the match just reported.
            resumeAt = matches.suffix().first;
        }

        return 0;
    }
    
    // matches for 'localhost is 127.0.0.1.'
    // Prefix: 'localhost is '
    // 0: 127.0.0.1
    // 1: 0.
    // 2: 0
    // 3: 0
    // 4:
    // 5:
    // 6:
    // 7: 1
    // 8: 1
    // 9:
    // 10:
    // 11:
    // Suffix: '.'
    

    这个模式使用了一系列嵌套子表达式。先看(((\d{1,2})|(1\d{2})|(2[0-4]\d)|(25[0-5]))\.)里的4个子表达式:(\d{1,2})匹配任意一位或两位数字(0~99);(1\d{2})匹配以1开头的任意三位数字(100~199);(2[0-4]\d)匹配整数200~249;(25[0-5])匹配整数250~255。这几个子表达式通过|操作符结合为一个更大的子表达式(其含义是只须匹配这4个子表达式之一即可)。随后的\.用来匹配.字符,它与前4个子表达式构成的子表达式又构成了一个更大的子表达式,而接下来的{3}表明需要重复3次。最后,数值范围又重复了一次(这次省略了尾部的.)以匹配IP地址里的最后一组数字。通过把4组以.分隔的数字的取值范围都限制在0~255之间,这个模式准确无误地做到了只匹配合法的IP地址,但不匹配非法的IP地址。

    #4. URL地址

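    文本:

    • http://www.forta.com/blog
    • https://www.forta.com:80/blog/index.cfm
    • http://www.forta.com
    • http://localhost:8500/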

    Pattern:

    • https?://(\w*:\w*@)?[-\w.]+(:\d+)?(/([\w/_.]*(\?\S+)?)?)?
    #include <Windows.h>
    #include <regex>
    #include <iostream>
    #include <string>
    
    // Demo: pull every http/https URL out of a space-separated sample string and
    // print each match with its capture groups (credentials, port, path, query).
    int main()
    {
        const std::string text = "http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com http://localhost:8500/";
        // Scheme, optional "user:password@", host, optional ":port", and an
        // optional path that may itself end with a "?query" part.
        const std::regex expression(R"(https?://(\w*:\w*@)?[-\w.]+(:\d+)?(/([\w/_.]*(\?\S+)?)?)?)");

        std::smatch matches;
        std::string::const_iterator from = text.cbegin();
        const std::string::const_iterator to = text.cend();

        for (;;)
        {
            // Stop as soon as no further URL is found in the remaining text.
            if (!std::regex_search(from, to, matches, expression))
            {
                break;
            }

            std::cout << "matches for '" << text << "'\n";
            std::cout << "Prefix: '" << matches.prefix() << "'\n";

            const std::size_t groupCount = matches.size();
            for (std::size_t g = 0; g < groupCount; ++g)
            {
                // str(g) is empty for groups that did not take part in the match.
                std::cout << g << ": " << matches.str(g) << '\n';
            }

            std::cout << "Suffix: '" << matches.suffix() << "'\n\n";

            // Next search starts at the first character after this match.
            from = matches.suffix().first;
        }

        return 0;
    }
    
    // matches for 'http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com 
    // http://localhost:8500/'
    // Prefix: ''
    // 0: http://www.forta.com/blog
    // 1:
    // 2:
    // 3: /blog
    // 4: blog
    // 5:
    // Suffix: ' https://www.forta.com:80/blog/index.cfm http://www.forta.com http://localhost:8500/'
    
    // matches for 'http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com 
    // http://localhost:8500/'
    // Prefix: ' '
    // 0: https://www.forta.com:80/blog/index.cfm
    // 1:
    // 2: :80
    // 3: /blog/index.cfm
    // 4: blog/index.cfm
    // 5:
    // Suffix: ' http://www.forta.com http://localhost:8500/'
    
    // matches for 'http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com 
    // http://localhost:8500/'
    // Prefix: ' '
    // 0: http://www.forta.com
    // 1:
    // 2:
    // 3:
    // 4:
    // 5:
    // Suffix: ' http://localhost:8500/'
    
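    // NOTE: this match only appears when the input text also contains
    // 'http://ben:password@www.forta.com/' (as in the text shown below);
    // the sample string in the code above does not include that URL.
    // matches for 'http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com 
    // http://ben:password@www.forta.com/ http://localhost:8500/'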
    // Prefix: ' '
    // 0: http://ben:password@www.forta.com/
    // 1: ben:password@
    // 2:
    // 3: /
    // 4:
    // 5:
    // Suffix: ' http://localhost:8500/'
        
    // matches for 'http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com 
    // http://localhost:8500/'
    // Prefix: ' '
    // 0: http://localhost:8500/
    // 1:
    // 2: :8500
    // 3: /
    // 4:
    // 5:
    // Suffix: ''
    

    https?://后面的是(\w*:\w*@)?,它将匹配嵌在URL字符串里的用户名和口令字(用户名和口令字要用:隔开,它们的后面还跟着一个@字符)。子表达式(\?\S+)?负责匹配查询字符串。查询字符串是在URL字符串里出现在?后面的文本,这些文本是可选的。

    #5. 电子邮件地址

    文本:

    • My name is Ben Forta, and my email address is ben@forta.com.

    Pattern:

    • (\w+\.)*\w+@(\w+\.)+[A-Za-z]+
    #include <Windows.h>
    #include <regex>
    #include <iostream>
    #include <string>
    
    // Demo: find every e-mail address in a sample sentence and print each match
    // with its prefix, capture groups, and suffix.
    int main()
    {
        const std::string text = "My name is Ben Forta, and my email address is ben@forta.com.";
        // Dotted user name, "@", one or more "label." parts, then a letters-only
        // top-level domain.
        const std::regex expression(R"((\w+\.)*\w+@(\w+\.)+[A-Za-z]+)");

        std::smatch matches;
        std::string::const_iterator searchFrom = text.cbegin();
        while (std::regex_search(searchFrom, text.cend(), matches, expression))
        {
            std::cout << "matches for '" << text << "'\n";
            std::cout << "Prefix: '" << matches.prefix() << "'\n";

            // Walk the groups by iterator; the distance from begin() is the index.
            for (auto group = matches.begin(); group != matches.end(); ++group)
            {
                std::cout << std::distance(matches.begin(), group) << ": " << *group << '\n';
            }

            std::cout << "Suffix: '" << matches.suffix() << "'\n\n";

            // Resume just past the current match.
            searchFrom = matches.suffix().first;
        }

        return 0;
    }
    
    // matches for 'My name is Ben Forta, and my email address is ben@forta.com.'
    // Prefix: 'My name is Ben Forta, and my email address is '
    // 0: ben@forta.com
    // 1:
    // 2: forta.
    // Suffix: '.'
    

    (\w+\.)*\w+负责匹配电子邮件地址里的用户名部分(@之前的所有文本):(\w+\.)*匹配一些由.结束的文本的零次或多次重复出现,\w+匹配必不可少的文本(这个组合将匹配ben和ben.forta等)。接下来,@匹配@字符本身,(\w+\.)+匹配一个或多个以.结束的字符串,[A-Za-z]+匹配顶级域名(com、edu、us或uk,等等)。

    相关文章

      网友评论

          本文标题:Regex实践

          本文链接:https://www.haomeiwen.com/subject/mzhgidtx.html