美文网首页
字符串过滤 Emoji

字符串过滤 Emoji

作者: alfredking12 | 来源:发表于2018-01-03 21:03 被阅读14次

代码

  • Program
using System;
using System.Collections.Generic;
using System.Text;

namespace app
{
    /// <summary>
    /// Console demo: strips emoji from a sample string and prints the text
    /// before and after filtering.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. Filters the sample string with the emoji table read from
        /// "emoji-test.txt" and writes both strings to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Path handed to the filter as its emoji data file.
            const string emojiDataPath = "emoji-test.txt";

            string str1 = "12😃3";
            string str2 = UnicodeUtil.FilterEmoji(str1, emojiDataPath);

            Console.WriteLine($"str1 = {str1}");
            Console.WriteLine($"str2 = {str2}");
        }
    }
}


---

输出:
str1 = 12😃3
str2 = 123
  • EmojiData (EmojiItem)
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace app {

    /// <summary>
    /// One emoji table entry: a code point sequence plus the text of the
    /// field that followed it on the data line.
    /// </summary>
    public class EmojiItem
    {
        /// <summary>Code points that make up the sequence, in order.</summary>
        public uint[] codePoints { get; set; }

        /// <summary>
        /// Raw text of the second ';'-separated field of the data line
        /// (stored untrimmed by the loader), or "" when the line had none.
        /// </summary>
        public string qualified { get; set; }
    }

    /// <summary>
    /// Table of emoji code point sequences parsed from a text file whose lines
    /// look like <c>HEX [HEX ...] ; status # comment</c> (the layout of
    /// Unicode's emoji-test.txt).
    /// </summary>
    public class EmojiData {

        /// <summary>
        /// Entries from the most recent successful <see cref="load"/>, ordered by
        /// descending code point count so that <see cref="find"/> meets the
        /// longest candidate sequence first.
        /// </summary>
        public List<EmojiItem> items {get; private set;} = new List<EmojiItem>();

        /// <summary>
        /// Parses a space-separated list of hexadecimal code points.
        /// </summary>
        /// <param name="line">Text such as "1F468 200D 1F469".</param>
        /// <returns>The parsed code points, or null when the text holds none.</returns>
        /// <exception cref="FormatException">A token is not valid hexadecimal.</exception>
        private uint[] getCodePoints(string line)
        {
            var ret = new List<uint>();

            foreach (var str in line.Trim().Split(' '))
            {
                // Runs of spaces produce empty tokens; skip them.
                if (string.IsNullOrWhiteSpace(str))
                    continue;

                ret.Add(Convert.ToUInt32($"0x{str}", 16));
            }

            return ret.Count > 0 ? ret.ToArray() : null;
        }

        /// <summary>
        /// Replaces <see cref="items"/> with the entries read from
        /// <paramref name="path"/>.
        /// </summary>
        /// <remarks>
        /// Everything after the first '#' on a line is ignored, as are lines
        /// that are blank or carry no code points. The new list is swapped in
        /// only after the whole file has been parsed, so an I/O or parse error
        /// leaves the previously loaded entries untouched.
        /// </remarks>
        /// <param name="path">Path of the emoji data file.</param>
        public void load(string path)
        {
            var loaded = new List<EmojiItem>();

            foreach (var line in File.ReadLines(path))
            {
                var str_line = line.Split('#')[0];
                if (string.IsNullOrWhiteSpace(str_line))
                    continue;

                var arr1 = str_line.Split(';');

                var codePoints = getCodePoints(arr1[0]);
                if (codePoints == null)
                    continue;

                loaded.Add(new EmojiItem
                {
                    codePoints = codePoints,
                    qualified = arr1.Length > 1 ? arr1[1] : ""
                });
            }

            // One stable sort after reading: entries of equal length keep file
            // order, which is the same order re-sorting after every append gave.
            items = loaded.OrderByDescending(x => x.codePoints.Length).ToList();
        }

        /// <summary>
        /// Returns the first entry, in <see cref="items"/> order, whose whole
        /// sequence occurs in <paramref name="codePoints"/> starting at
        /// <paramref name="offset"/>.
        /// </summary>
        /// <param name="codePoints">Code points to search in.</param>
        /// <param name="offset">Index in <paramref name="codePoints"/> where the match must start.</param>
        /// <returns>The matching entry, or null when no entry matches.</returns>
        public EmojiItem find(uint[] codePoints, int offset)
        {
            foreach(var item in items)
            {
                if (match(item.codePoints, codePoints, offset))
                    return item;
            }

            return null;
        }

        /// <summary>
        /// Tests whether <paramref name="dest"/> occurs in <paramref name="src"/>
        /// beginning exactly at <paramref name="offset"/>.
        /// </summary>
        private bool match(uint[] dest, uint[] src, int offset)
        {
            // Not enough code points left in src to hold the whole sequence.
            if (dest.Length > src.Length - offset)
                return false;

            for(var i=0;i<dest.Length;i++)
            {
                if (dest[i] != src[offset + i])
                    return false;
            }

            return true;
        }
    }
}
  • UnicodeUtil
using System;
using System.Collections.Generic;
using System.Text;

namespace app {
    /// <summary>
    /// Helpers for converting strings to Unicode code points and for removing
    /// emoji sequences from strings.
    /// </summary>
    public static class UnicodeUtil {
        #region Private

        // Emoji table shared by every FilterEmoji call. volatile so that the
        // unlocked check in GetEmojiData observes the reference written under
        // the lock.
        private static volatile EmojiData CachedEmojiData = null;
        private static readonly object EmojiDataLocker = new object();

        /// <summary>
        /// Reads two bytes as an unsigned 16-bit value.
        /// </summary>
        /// <param name="src">Bytes in little-endian order.</param>
        /// <param name="offset">Index of the low byte.</param>
        /// <returns>The 16-bit value at <paramref name="offset"/>.</returns>
        private static ushort ToShort(byte[] src, int offset)
        {
            return (ushort)((src[offset] | src[offset + 1] << 8) & 0x0000ffff);
        }

        /// <summary>
        /// Encodes one code point as little-endian UTF-16 bytes.
        /// </summary>
        /// <param name="value">Code point to encode.</param>
        /// <returns>Two bytes for values below 0x10000, otherwise a four-byte surrogate pair.</returns>
        private static byte[] FromCodePoint(uint value)
        {
            if (value < 0x10000)
            {
                return new byte[]
                {
                    (byte)(value & 0xFF),
                    (byte)((value >> 8) & 0xFF)
                };
            }

            // Supplementary plane: let the framework build the surrogate pair
            // instead of hand-encoding UTF-8 and converting back.
            return Encoding.Unicode.GetBytes(char.ConvertFromUtf32((int)value));
        }

        /// <summary>
        /// Decodes the code point that starts at <paramref name="offset"/> in
        /// little-endian UTF-16 bytes and advances <paramref name="offset"/>
        /// past it (by 2 or 4 bytes).
        /// </summary>
        /// <remarks>
        /// A high surrogate is assumed to be followed by its low surrogate.
        /// The only caller feeds bytes from Encoding.Unicode.GetBytes, which is
        /// expected to have replaced unpaired surrogates already - confirm
        /// before reusing this helper on arbitrary byte input.
        /// </remarks>
        private static uint GetCodePoint(byte[] src, ref int offset)
        {
            uint result;

            // High byte of the first UTF-16 unit; 0xD8..0xDB marks a high surrogate.
            var b = src[offset + 1];

            if (b >= 0xD8 && b <= 0xDB)
            {
                var hs = (uint)ToShort(src, offset);
                var ls = (uint)ToShort(src, offset + 2);
                result = 0x10000 + (hs - 0xD800) * 0x400 + (ls - 0xDC00);
                offset += 4;
            }
            else
            {
                result = (uint)ToShort(src, offset);
                offset += 2;
            }

            return result;
        }

        /// <summary>
        /// Returns the shared emoji table, loading it from
        /// <paramref name="emojiDataFile"/> on first use.
        /// </summary>
        /// <remarks>
        /// The table is published only after it has loaded completely, so a
        /// concurrent caller never scans a partial table and a failed load is
        /// not cached (the next call retries).
        /// </remarks>
        private static EmojiData GetEmojiData(string emojiDataFile)
        {
            var cached = CachedEmojiData;
            if (cached != null)
                return cached;

            lock (EmojiDataLocker)
            {
                if (CachedEmojiData == null)
                {
                    var loaded = new EmojiData();
                    loaded.load(emojiDataFile);
                    CachedEmojiData = loaded;
                }

                return CachedEmojiData;
            }
        }

        #endregion


        /// <summary>
        /// Splits a string into its Unicode code points, combining surrogate
        /// pairs into single values.
        /// </summary>
        /// <param name="source">Text to decode.</param>
        /// <returns>The code points of <paramref name="source"/>, in order.</returns>
        public static uint[] GetCodePoints(String source)
        {
            var bytes = Encoding.Unicode.GetBytes(source);
            List<uint> codePoints = new List<uint>();
            int offset = 0;

            while (offset < bytes.Length)
            {
                codePoints.Add(GetCodePoint(bytes, ref offset));
            }

            return codePoints.ToArray();
        }

        /// <summary>
        /// Returns <paramref name="source"/> with every emoji sequence listed
        /// in the emoji data file removed.
        /// </summary>
        /// <param name="source">Text to filter.</param>
        /// <param name="emojiDataFile">
        /// Path of the emoji data file. It is read once, on the first call that
        /// loads it successfully; later calls reuse that table and ignore this
        /// argument.
        /// </param>
        /// <returns>The text without the matched emoji sequences.</returns>
        public static string FilterEmoji(string source, string emojiDataFile)
        {
            var emojiData = GetEmojiData(emojiDataFile);

            List<byte> buffer = new List<byte>();
            var codePoints = GetCodePoints(source);
            int offset = 0;

            while (offset < codePoints.Length)
            {
                var item = emojiData.find(codePoints, offset);
                if (item != null)
                {
                    // Drop the whole matched sequence.
                    offset += item.codePoints.Length;
                }
                else
                {
                    buffer.AddRange(FromCodePoint(codePoints[offset]));
                    offset++;
                }
            }

            return Encoding.Unicode.GetString(buffer.ToArray());
        }
    }
}

emoji-test.txt (Emoji 5.0)

官网 http://www.unicode.org/Public/emoji/5.0/
或者
百度盘 链接: https://pan.baidu.com/s/1geJvb6V 密码: y12b

参考

https://emojipedia.org/
https://apps.timwhitlock.info/emoji/tables/unicode#block-6b-additional-transport-and-map-symbols
https://unicode.org/emoji/charts/full-emoji-list.html

相关文章

网友评论

      本文标题:字符串过滤 Emoji

      本文链接:https://www.haomeiwen.com/subject/ajcknxtx.html