代码
- Program
using System;
using System.Collections.Generic;
using System.Text;
namespace app
{
    /// <summary>
    /// Console demo: strips emoji from a sample string and prints the
    /// text before and after filtering.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Entry point. Filters a hard-coded sample through
        /// <c>UnicodeUtil.FilterEmoji</c> using the emoji table in
        /// "emoji-test.txt" and writes both strings to standard output.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string emojiDataFile = "emoji-test.txt";

            string original = "12😃3";
            string filtered = UnicodeUtil.FilterEmoji(original, emojiDataFile);

            Console.WriteLine("str1 = " + original);
            Console.WriteLine("str2 = " + filtered);
        }
    }
}
---
输出:
str1 = 12😃3
str2 = 123
- EmojiData (EmojiItem)
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace app
{
    /// <summary>
    /// One entry parsed from an emoji data file: the code point sequence and
    /// the text of the second ';'-separated field of the line.
    /// </summary>
    public class EmojiItem
    {
        /// <summary>Code points of the sequence, in file order.</summary>
        public uint[] codePoints { get; set; }

        /// <summary>
        /// Second ';'-separated field of the line, trimmed; empty when the
        /// line has no such field.
        /// </summary>
        public string qualified { get; set; }
    }

    /// <summary>
    /// In-memory table of emoji code point sequences loaded from a text file
    /// whose data lines look like <c>HEX HEX ... ; status # comment</c>.
    /// </summary>
    public class EmojiData
    {
        /// <summary>
        /// Loaded entries, ordered by descending sequence length so that
        /// <see cref="find"/> reports the longest matching sequence first.
        /// </summary>
        public List<EmojiItem> items { get; private set; } = new List<EmojiItem>();

        /// <summary>
        /// Parses a whitespace-separated list of hexadecimal code points.
        /// </summary>
        /// <param name="line">Text such as "1F468 200D 1F469".</param>
        /// <returns>The parsed code points, or null when the text holds none.</returns>
        /// <exception cref="FormatException">A token is not valid hexadecimal.</exception>
        private uint[] getCodePoints(string line)
        {
            var tokens = line.Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length == 0)
                return null;

            var ret = new uint[tokens.Length];
            for (var i = 0; i < tokens.Length; i++)
                ret[i] = Convert.ToUInt32(tokens[i], 16);
            return ret;
        }

        /// <summary>
        /// Replaces the current table with the entries read from
        /// <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Path of the emoji data file.</param>
        /// <remarks>
        /// Everything from the first '#' of a line onward is ignored, and
        /// lines that are blank after that are skipped. The table is only
        /// replaced once the whole file has been parsed, so a failure
        /// part-way through leaves the previous entries in place.
        /// </remarks>
        public void load(string path)
        {
            var parsed = new List<EmojiItem>();
            foreach (var line in File.ReadLines(path))
            {
                // Keep only the part before the first '#'.
                var data = line.Split('#')[0];
                if (string.IsNullOrWhiteSpace(data))
                    continue;

                var fields = data.Split(';');
                var codePoints = getCodePoints(fields[0]);
                if (codePoints == null)
                    continue;

                parsed.Add(new EmojiItem
                {
                    codePoints = codePoints,
                    qualified = fields.Length > 1 ? fields[1].Trim() : ""
                });
            }

            // Sort once, after parsing. OrderByDescending is a stable sort,
            // so entries of equal length keep their file order.
            items = parsed.OrderByDescending(x => x.codePoints.Length).ToList();
        }

        /// <summary>
        /// Finds the first entry in <see cref="items"/> whose sequence occurs
        /// in <paramref name="codePoints"/> starting at
        /// <paramref name="offset"/>. Because the entries are ordered longest
        /// first, this is the longest matching sequence.
        /// </summary>
        /// <param name="codePoints">Code points of the text being scanned.</param>
        /// <param name="offset">Index in <paramref name="codePoints"/> to match at.</param>
        /// <returns>The matching entry, or null when nothing matches.</returns>
        public EmojiItem find(uint[] codePoints, int offset)
        {
            foreach (var item in items)
            {
                if (match(item.codePoints, codePoints, offset))
                    return item;
            }
            return null;
        }

        /// <summary>
        /// Tests whether <paramref name="dest"/> occurs in
        /// <paramref name="src"/> starting at <paramref name="offset"/>.
        /// </summary>
        private bool match(uint[] dest, uint[] src, int offset)
        {
            // Not enough code points left in src to hold dest.
            if (dest.Length > src.Length - offset)
                return false;

            for (var i = 0; i < dest.Length; i++)
            {
                if (dest[i] != src[offset + i])
                    return false;
            }
            return true;
        }
    }
}
- UnicodeUtil
using System;
using System.Collections.Generic;
using System.Text;
namespace app
{
    /// <summary>
    /// Helpers for converting strings to and from Unicode code points and
    /// for removing emoji sequences listed in an emoji data file.
    /// </summary>
    public static class UnicodeUtil
    {
        #region Private

        /// <summary>Guards the cached emoji table and the path it was loaded from.</summary>
        private static readonly object EmojiDataLocker = new object();

        /// <summary>Most recently loaded emoji table, or null before the first load.</summary>
        private static EmojiData s_emojiData = null;

        /// <summary>Path that <see cref="s_emojiData"/> was loaded from.</summary>
        private static string s_emojiDataFile = null;

        /// <summary>
        /// Returns the emoji table for <paramref name="emojiDataFile"/>,
        /// loading it on first use or when a different path is requested.
        /// </summary>
        /// <remarks>
        /// The table is fully loaded before it is stored in the cache, so a
        /// failed load leaves no empty table behind and callers never see a
        /// partially populated one.
        /// </remarks>
        private static EmojiData GetEmojiData(string emojiDataFile)
        {
            lock (EmojiDataLocker)
            {
                if (s_emojiData == null ||
                    !string.Equals(s_emojiDataFile, emojiDataFile, StringComparison.Ordinal))
                {
                    var loaded = new EmojiData();
                    loaded.load(emojiDataFile);
                    s_emojiData = loaded;
                    s_emojiDataFile = emojiDataFile;
                }
                return s_emojiData;
            }
        }

        /// <summary>
        /// Reads a 16-bit unsigned value from two consecutive bytes.
        /// </summary>
        /// <param name="src">Buffer holding the value in little-endian order.</param>
        /// <param name="offset">Index of the low byte.</param>
        /// <returns>The 16-bit value.</returns>
        private static ushort ToShort(byte[] src, int offset)
        {
            return (ushort)((src[offset] | src[offset + 1] << 8) & 0x0000ffff);
        }

        /// <summary>
        /// Encodes one code point as UTF-16 little-endian bytes: two bytes
        /// below U+10000, otherwise a four-byte surrogate pair.
        /// </summary>
        /// <param name="value">Code point to encode.</param>
        /// <returns>The encoded bytes.</returns>
        private static byte[] FromCodePoint(uint value)
        {
            if (value < 0x10000)
            {
                return new[] { (byte)(value & 0xFF), (byte)((value >> 8) & 0xFF) };
            }

            // Inverse of the surrogate arithmetic in GetCodePoint.
            uint v = value - 0x10000;
            uint hs = 0xD800 + (v >> 10);
            uint ls = 0xDC00 + (v & 0x3FF);
            return new[]
            {
                (byte)(hs & 0xFF), (byte)((hs >> 8) & 0xFF),
                (byte)(ls & 0xFF), (byte)((ls >> 8) & 0xFF)
            };
        }

        /// <summary>
        /// Decodes the code point that starts at <paramref name="offset"/> in
        /// a UTF-16 little-endian buffer and advances
        /// <paramref name="offset"/> past it (by 2 or 4 bytes).
        /// </summary>
        /// <param name="src">UTF-16 little-endian bytes.</param>
        /// <param name="offset">Byte index of the code unit to decode; updated on return.</param>
        /// <returns>The decoded code point.</returns>
        private static uint GetCodePoint(byte[] src, ref int offset)
        {
            uint result;

            // The high byte of a high surrogate (U+D800..U+DBFF) is 0xD8..0xDB.
            var b = src[offset + 1];
            if (b >= 0xD8 && b <= 0xDB)
            {
                // NOTE(review): assumes a low surrogate follows; bytes produced by
                // Encoding.Unicode.GetBytes are expected to satisfy this - confirm.
                var hs = (uint)ToShort(src, offset);
                var ls = (uint)ToShort(src, offset + 2);
                result = 0x10000 + (hs - 0xD800) * 0x400 + (ls - 0xDC00);
                offset += 4;
            }
            else
            {
                result = (uint)ToShort(src, offset);
                offset += 2;
            }
            return result;
        }

        #endregion

        /// <summary>
        /// Splits a string into its Unicode code points.
        /// </summary>
        /// <param name="source">Text to decode.</param>
        /// <returns>The code points of <paramref name="source"/>, in order.</returns>
        public static uint[] GetCodePoints(String source)
        {
            var bytes = Encoding.Unicode.GetBytes(source);
            var codePoints = new List<uint>(bytes.Length / 2);
            int offset = 0;
            while (offset < bytes.Length)
            {
                codePoints.Add(GetCodePoint(bytes, ref offset));
            }
            return codePoints.ToArray();
        }

        /// <summary>
        /// Returns <paramref name="source"/> with every emoji sequence listed
        /// in <paramref name="emojiDataFile"/> removed.
        /// </summary>
        /// <param name="source">Text to filter.</param>
        /// <param name="emojiDataFile">
        /// Path of the emoji data file. The parsed table is cached and reused
        /// while the same path is passed.
        /// </param>
        /// <returns>The text without the matched emoji sequences.</returns>
        public static string FilterEmoji(string source, string emojiDataFile)
        {
            var emojiData = GetEmojiData(emojiDataFile);
            var codePoints = GetCodePoints(source);

            var buffer = new List<byte>(codePoints.Length * 2);
            int offset = 0;
            while (offset < codePoints.Length)
            {
                var item = emojiData.find(codePoints, offset);
                if (item != null)
                {
                    // Skip the whole matched sequence.
                    offset += item.codePoints.Length;
                }
                else
                {
                    buffer.AddRange(FromCodePoint(codePoints[offset]));
                    offset++;
                }
            }
            return Encoding.Unicode.GetString(buffer.ToArray());
        }
    }
}
emoji-test.txt (Emoji 5.0)
官网 http://www.unicode.org/Public/emoji/5.0/
或者
百度盘 链接: https://pan.baidu.com/s/1geJvb6V 密码: y12b
参考
https://emojipedia.org/
https://apps.timwhitlock.info/emoji/tables/unicode#block-6b-additional-transport-and-map-symbols
https://unicode.org/emoji/charts/full-emoji-list.html
网友评论