美文网首页
文本文件的编码格式识别功能 (UTF-8,ANSI,UTF-16)

文本文件的编码格式识别功能 (UTF-8,ANSI,UTF-16)

作者: 剑舞春秋 | 来源:发表于2013-01-29 08:52 被阅读510次

    代码:

    /* Text-encoding categories used as the result type of TellEncodeType. */
    enum ENCODETYPE
    {
        UNKNOW = 0,  /* initial value: no encoding has been decided */
        ANSI   = 1,  /* a byte sequence that is not well-formed UTF-8 was seen */
        UTF8   = 2,  /* enough well-formed UTF-8 multi-byte sequences were seen */
        UTF16  = 3   /* buffer starts with a UTF-16 byte-order mark */
    };
    ENCODETYPE TellEncodeType(BYTE* pBuf,int bufLen)
    {
    ENCODETYPE filetype=UNKNOW;
    
    if (pBuf[0]==0xFF && pBuf[1]==0xFE
        || pBuf[0]==0xFE && pBuf[1]==0xFF)
        filetype=UTF16;
    else
    {
        int    utf8Nums=0;    //符合UTF8编码的字符个数,非Ansi部分
        int count=0;
        while(count         {
            int i=0;
            while( i             {
                if (pBuf[count+i]>0xC0)
                {
                    if (pBuf[count+i+1]<0x80 || pBuf[count+i+1]>0xC0)
                    {
                        filetype=ANSI;
                        break;
                    }
                    else
                    {
                        /*
                        The transformation table for UTF-8 is presented below:
                        UNICODE                 UTF-8
                        00000000 - 0000007F     0xxxxxxx
                        00000080 - 000007FF     110xxxxx 10xxxxxx
                        00000800 - 0000FFFF     1110xxxx 10xxxxxx 10xxxxxx                          //0xE0
                        00010000 - 001FFFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx                 //0xF0
                        00200000 - 03FFFFFF      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx        //0xF8
                        04000000 - 7FFFFFFF     1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx   //0xFC
                        */
    
                        BYTE *tC;//target Char to 识别
    
                        int utfStrLen=0;
                        //是否为正确的UTF8格式
                        tC=pBuf+count+i;
    
                        if (tC[0]<=0xF0)//<4字节
                            if(tC[0]>0xE0)    utfStrLen=3;
                            else              utfStrLen=2;   
                        else if (tC[0]>=0xFC) utfStrLen=6;
                        else if(tC[0]>=0xF8)  utfStrLen=5;
                        else                  utfStrLen=4;
    
                        int k=1;
                        while(k                         {
                            if (!(tC[k] & 0x80 && !(tC[k]& 0x40) ))//前二位为10
                            {
                                filetype=ANSI;
                                break;
                            }
    
                            k++;
                        }
    
                        if (k==utfStrLen)
                            utf8Nums++;
                        if (utf8Nums==10)
                            filetype=UTF8;
                    }
                }
    
                i++;
            }//while( i
            ++count;
        }//while(count
    }//else
    
    
    return filetype;
    }
    

    调用例子

     /*
      * Usage example: lets the user pick a file, loads it into memory and runs
      * TellEncodeType on its contents. Always returns 0. The detected type is
      * only stored in a local variable; this example does nothing further with it.
      */
     LRESULT OnOpenFile(WORD /*wNotifyCode*/, WORD /*wID*/, HWND /*hWndCtl*/, BOOL& /*bHandled*/)
    {
        //const TCHAR szFilter[]=_T("playlist files(*.pl)\0*.pl\0");
        CFileDialog dlg(TRUE,NULL,NULL,OFN_FILEMUSTEXIST|OFN_DONTADDTORECENT,NULL,m_hWnd);
        if(dlg.DoModal()!=IDOK)return 0;

        LPTSTR filepath=dlg.m_ofn.lpstrFile;

        ENCODETYPE filetype=UNKNOW;
        FILE * pFile = _tfopen( filepath, _T("rb") );
        if (pFile==NULL)
        {
            return 0;
        }

        // Determine the file size by seeking to the end; every step is checked
        // so a failed seek/tell never feeds a bogus size into malloc/fread.
        if (fseek(pFile,0,SEEK_END)==0)
        {
            long filesize=ftell(pFile);
            if (filesize>0 && fseek(pFile,0,SEEK_SET)==0)
            {
                BYTE * pBuf=(BYTE*)malloc((size_t)filesize);
                if (pBuf!=NULL)
                {
                    // Only hand over the bytes that were actually read.
                    // NOTE(review): the cast assumes the file fits in an int,
                    // which is what TellEncodeType accepts.
                    size_t bytesRead=fread(pBuf,1,(size_t)filesize,pFile);
                    filetype=TellEncodeType(pBuf,(int)bytesRead);

                    // The buffer is owned by this function; release it here.
                    free(pBuf);
                }
            }
        }

        fclose (pFile);
        (void)filetype;
        return 0;
    }

    相关文章

      网友评论

          本文标题:文本文件的编码格式识别功能 (UTF-8,ANSI,UTF-16)

          本文链接:https://www.haomeiwen.com/subject/zuqttttx.html