美文网首页
c++实现删除utf8多个字符

c++实现删除utf8多个字符

作者: 一路向后 | 来源:发表于2021-11-14 14:58 被阅读0次

    1.UTF-8.h

    // Include guard. Names that start with an underscore followed by an
    // uppercase letter are reserved for the implementation, so the guard
    // macro deliberately has no leading underscore.
    #ifndef RAQUEL_UTF8_H_
    #define RAQUEL_UTF8_H_

    namespace Raquel {
        /// Stateless helpers for NUL-terminated, UTF-8 encoded C strings.
        ///
        /// The byte width of a character is derived from the high bits of its
        /// first byte only (1, 2, 3 or 4 bytes); continuation bytes are not
        /// validated. No buffer sizes are passed to any function, so callers
        /// are responsible for providing large enough output buffers.
        class UTF8 {
        public:
            /// @brief Counts the bytes in front of the terminating NUL.
            /// @param buf NUL-terminated input.
            /// @return Number of bytes, terminator excluded.
            static int getLength(const char *buf);

            /// @brief Counts the characters in a UTF-8 string.
            /// @param buf NUL-terminated input.
            /// @return Number of characters; a trailing sequence that is cut
            ///         short by the terminator is not counted.
            static int getULength(const char *buf);

            /// @brief Counts the characters in a UTF-8 string and reports where
            ///        the complete part of the string ends.
            /// @param buf NUL-terminated input.
            /// @param pos Receives the byte offset at which an incomplete
            ///            trailing sequence starts, or the byte length of buf
            ///            when the last sequence is complete.
            /// @return Number of characters; an incomplete trailing sequence is
            ///         not counted.
            static int getULength(const char *buf, int &pos);

            /// @brief Copies buf to out while omitting characters that also
            ///        occur in del.
            /// @param buf NUL-terminated input.
            /// @param out Receives the kept characters followed by a NUL; must
            ///            be able to hold every byte of buf plus the terminator.
            /// @param rem Receives, NUL-terminated, an incomplete trailing
            ///            sequence of buf when one is present and was not
            ///            deleted (at most 3 bytes plus the terminator).
            /// @param del NUL-terminated list of characters to delete.
            /// @return Number of characters that were deleted.
            static int delByChar(const char *buf, char *out, char *rem, const char *del);
        };
    }

    #endif
    

    2.UTF-8.cpp

    #include <iostream>
    #include <cstring>
    #include "UTF-8.h"
    
    using namespace std;
    
    // Returns the number of bytes that precede the terminating NUL of buf.
    int Raquel::UTF8::getLength(const char *buf)
    {
        const char *cursor = buf;

        // Walk forward until the terminator is reached.
        while(*cursor != 0x00)
        {
            ++cursor;
        }

        return static_cast<int>(cursor - buf);
    }
    
    // Counts the characters of a NUL-terminated UTF-8 string.
    //
    // The width of each character is taken from its first byte: 1 byte when the
    // top bit is clear, otherwise 2, 3 or 4 bytes depending on bits 0x20 and
    // 0x10. A final sequence that is cut short by the terminator is consumed
    // but not included in the returned count.
    int Raquel::UTF8::getULength(const char *buf)
    {
        int bytePos = 0;
        int charCount = 0;
        bool tailIncomplete = false;

        while(buf[bytePos] != 0x00)
        {
            const unsigned char lead = static_cast<unsigned char>(buf[bytePos]);

            // Width announced by the first byte.
            int expected = 1;

            if((lead & 0x80) != 0)
            {
                if((lead & 0x20) == 0)
                {
                    expected = 2;
                }
                else if((lead & 0x10) == 0)
                {
                    expected = 3;
                }
                else
                {
                    expected = 4;
                }
            }

            // Bytes actually available before the terminator; stops at the
            // first NUL so nothing behind the terminator is ever read.
            int present = 1;

            while(present < expected && buf[bytePos + present] != 0x00)
            {
                ++present;
            }

            if(present < expected)
            {
                tailIncomplete = true;
            }

            bytePos += present;
            ++charCount;
        }

        return tailIncomplete ? charCount - 1 : charCount;
    }
    
    // Counts the characters of a NUL-terminated UTF-8 string and reports, via
    // pos, the byte offset at which an incomplete trailing sequence begins
    // (or the total byte length when the string ends on a complete sequence).
    //
    // Character width comes from the first byte alone: 1 byte when bit 0x80 is
    // clear, otherwise 2, 3 or 4 bytes depending on bits 0x20 and 0x10. An
    // incomplete trailing sequence is excluded from the returned count.
    int Raquel::UTF8::getULength(const char *buf, int &pos)
    {
        int cursor = 0;
        int charCount = 0;
        int danglingBytes = 0;   // bytes of an incomplete final sequence, 0 if none

        while(buf[cursor])
        {
            const int lead = buf[cursor];
            const int width = !(lead & 0x80) ? 1
                            : !(lead & 0x20) ? 2
                            : !(lead & 0x10) ? 3
                            : 4;

            // Count how many of the announced bytes exist before the terminator.
            int available = 1;

            for(; available < width; ++available)
            {
                if(buf[cursor + available] == 0x00)
                {
                    break;
                }
            }

            if(available < width)
            {
                danglingBytes = available;
            }

            cursor += available;
            ++charCount;
        }

        pos = cursor - danglingBytes;

        return danglingBytes != 0 ? charCount - 1 : charCount;
    }
    
    // Returns how many bytes of the UTF-8 sequence starting at seq are present
    // before the terminating NUL, and sets truncated when that is fewer than
    // the width announced by the first byte (1 byte when bit 0x80 is clear,
    // otherwise 2, 3 or 4 bytes depending on bits 0x20 and 0x10).
    // seq must point at a non-NUL byte.
    static int delByCharSequenceBytes(const char *seq, bool &truncated)
    {
        const unsigned char lead = static_cast<unsigned char>(*seq);
        int expected = 1;

        if((lead & 0x80) != 0)
        {
            if((lead & 0x20) == 0)
            {
                expected = 2;
            }
            else if((lead & 0x10) == 0)
            {
                expected = 3;
            }
            else
            {
                expected = 4;
            }
        }

        // Stop at the first NUL so nothing behind the terminator is read.
        int present = 1;

        while(present < expected && seq[present] != 0x00)
        {
            ++present;
        }

        truncated = (present < expected);

        return present;
    }

    // Copies buf to out, leaving out every character that also occurs in del.
    //
    // buf  NUL-terminated UTF-8 input.
    // out  Receives the kept characters followed by a NUL. Must be able to hold
    //      every byte of buf plus the terminator. Bytes are read before they
    //      are written and the write position never passes the read position.
    // rem  Receives an incomplete trailing sequence of buf (one cut short by
    //      the terminator) unless that sequence was deleted; it is always
    //      NUL-terminated on return, so it is empty when there is no leftover.
    //      Needs room for 3 bytes plus the terminator. May be null, in which
    //      case a leftover is discarded.
    // del  NUL-terminated UTF-8 list of characters to delete.
    //
    // Returns the number of characters removed.
    //
    // Two sequences are considered equal when they have the same number of
    // present bytes and those bytes are identical.
    int Raquel::UTF8::delByChar(const char *buf, char *out, char *rem, const char *del)
    {
        int readPos = 0;
        int writePos = 0;
        int remLen = 0;
        int deleted = 0;

        while(buf[readPos] != 0x00)
        {
            bool bufTruncated = false;
            const int width = delByCharSequenceBytes(buf + readPos, bufTruncated);

            // The match result is tracked explicitly for every character.
            // Deriving it from a loop counter that an earlier iteration left
            // behind made characters disappear whenever del held no entry of
            // the same byte width.
            bool matched = false;
            int delPos = 0;

            while(del[delPos] != 0x00 && !matched)
            {
                bool delTruncated = false;
                const int delWidth = delByCharSequenceBytes(del + delPos, delTruncated);

                matched = (delWidth == width)
                       && std::memcmp(del + delPos, buf + readPos,
                                      static_cast<std::size_t>(width)) == 0;

                delPos += delWidth;
            }

            if(matched)
            {
                ++deleted;
            }
            else if(!bufTruncated)
            {
                for(int t = 0; t < width; ++t)
                {
                    out[writePos++] = buf[readPos + t];
                }
            }
            else if(rem != nullptr)
            {
                // Incomplete final sequence: hand it back to the caller
                // instead of emitting a broken character.
                remLen = 0;

                for(int t = 0; t < width; ++t)
                {
                    rem[remLen++] = buf[readPos + t];
                }
            }

            readPos += width;
        }

        out[writePos] = 0x00;

        // Terminate rem even when nothing was left over so callers never see
        // stale content from an earlier call.
        if(rem != nullptr)
        {
            rem[remLen] = 0x00;
        }

        return deleted;
    }
    
    // Demo: prints a sample string, deletes three characters from it with
    // Raquel::UTF8::delByChar, then prints the number of deleted characters
    // and the remaining text.
    int main()
    {
        const char buf[64] = "你4μ\U00010102";
        char out[64] = {0};   // fully overwritten by delByChar
        char rem[5] = {0};    // room for up to 3 leftover bytes plus the terminator

        // The C strings are streamed directly; no std::string temporaries are
        // needed, so the program does not rely on <string> being pulled in
        // indirectly by <iostream>.
        std::cout << buf << '\n';
        std::cout << Raquel::UTF8::delByChar(buf, out, rem, "啊\U00010102你") << '\n';
        std::cout << out << '\n';

        return 0;
    }
    

    3.编译源码

    $ g++ -o UTF-8 UTF-8.cpp -std=c++11
    

    4.运行及其结果

    $ ./UTF-8
    你4μ𐄂
    2
    4μ
    

    相关文章

      网友评论

          本文标题:c++实现删除utf8多个字符

          本文链接:https://www.haomeiwen.com/subject/upfxtrtx.html