iOS开发中,比较蛋疼的一个问题,NSData转UTF-8格式的NSString,有时候会返回nil。
究其原因,无非就是:UTF-8的字节流中混进了其他编码格式的字符(即非法的UTF-8字节),这样NSData转NSString的时候,就会返回nil。
而现在网上的方法基本就这几个:
http://blog.csdn.net/cuibo1123/article/details/40938225
http://blog.csdn.net/xocom/article/details/50905578
http://www.cnblogs.com/xiao-love-meng/p/5757564.html
这几个方法都不完美,都不能完全解决我遇到的问题。
我的解决方法:
查了UTF-8的wiki,utf-8的编码格式如下,理论上可以到6个字节,但只用到了4个字节。
其中,有一些特殊字节是不会出现在utf-8中的:
utf-8中不会出现的字节
其中,还指出了判断utf-8的方法:
UTF-8字符串可以由一个简单的算法可靠地识别出来。就是,一个字符串在任何其它编码中表现为合法的UTF-8的可能性很低,并随字符串长度增长而减小。举例说,字符值C0,C1,F5至FF从来没有出现。为了更好的可靠性,可以使用正则表达式来统计非法过长和替代值(可以查看W3 FAQ: Multilingual Forms上的验证UTF-8字符串的正则表达式)。
$field =~
m/\A(
[\x09\x0A\x0D\x20-\x7E] # ASCII
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*\z/x;
因此,我们只要把上面的代码,转换成OC语言就行了:
这里我直接写了一个Category:
//
// Created by WDY on 2016/11/24.
// Copyright (c) 2016 andforce. All rights reserved.
//
#import "NSData+UTF8.h"
@implementation NSData (UTF8)

// Validation follows the W3C "well-formed UTF-8" pattern
// (https://www.w3.org/International/questions/qa-forms-utf-8,
//  see also https://zh.wikipedia.org/wiki/UTF-8):
//
//   [\x09\x0A\x0D\x20-\x7E]            # ASCII (TAB, LF, CR, printable)
// | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
// |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
// | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
// |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
// |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
// | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
// |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16

/// Returns YES when `byte` is a UTF-8 continuation byte (0x80...0xBF).
static inline BOOL WDYIsUTF8ContinuationByte(uint8_t byte) {
    return (byte & 0xC0) == 0x80;
}

/// Returns the length (1-4) of the well-formed UTF-8 sequence that starts at
/// `bytes`, or 0 when the bytes there do not match the pattern above.
/// `remaining` is the number of readable bytes at `bytes` and must be >= 1.
static NSUInteger WDYWellFormedUTF8SequenceLength(const uint8_t *bytes, NSUInteger remaining) {
    const uint8_t lead = bytes[0];

    // 1 byte: TAB, LF, CR and printable ASCII. Other control bytes and 0x7F
    // are deliberately rejected, exactly as the reference pattern does.
    if (lead == 0x09 || lead == 0x0A || lead == 0x0D || (0x20 <= lead && lead <= 0x7E)) {
        return 1;
    }

    // For multi-byte sequences only the SECOND byte has a lead-dependent
    // range; every later byte is a plain continuation byte.
    NSUInteger length = 0;
    uint8_t secondMin = 0x80;
    uint8_t secondMax = 0xBF;

    if (0xC2 <= lead && lead <= 0xDF) {
        // 0xC0/0xC1 would be overlong encodings of ASCII, hence start at 0xC2.
        length = 2;
    } else if (0xE0 <= lead && lead <= 0xEF) {
        length = 3;
        if (lead == 0xE0) {
            secondMin = 0xA0;   // exclude overlong 3-byte forms
        } else if (lead == 0xED) {
            secondMax = 0x9F;   // exclude UTF-16 surrogates U+D800...U+DFFF
        }
    } else if (0xF0 <= lead && lead <= 0xF4) {
        length = 4;
        if (lead == 0xF0) {
            secondMin = 0x90;   // exclude overlong 4-byte forms
        } else if (lead == 0xF4) {
            secondMax = 0x8F;   // plane 16: nothing above U+10FFFF
        }
    } else {
        // Stray continuation bytes, 0xC0/0xC1, 0xF5...0xFF (including the
        // obsolete 5- and 6-byte lead bytes) never start a valid sequence.
        return 0;
    }

    // A sequence truncated by the end of the data is invalid.
    if (remaining < length) {
        return 0;
    }
    if (bytes[1] < secondMin || bytes[1] > secondMax) {
        return 0;
    }
    for (NSUInteger offset = 2; offset < length; offset++) {
        if (!WDYIsUTF8ContinuationByte(bytes[offset])) {
            return 0;
        }
    }
    return length;
}

/// Decodes the receiver as UTF-8.
///
/// The receiver is first decoded as-is. If Foundation rejects it (returns nil),
/// the data is sanitized with -UTF8Data, which replaces every offending byte
/// with U+FFFD, and the sanitized data is decoded instead.
///
/// @return The decoded string.
- (NSString *)utf8String {
    NSString *string = [[NSString alloc] initWithData:self encoding:NSUTF8StringEncoding];
    if (string == nil) {
        string = [[NSString alloc] initWithData:[self UTF8Data] encoding:NSUTF8StringEncoding];
    }
    return string;
}

/// Returns a copy of the receiver in which every byte that is not part of a
/// well-formed UTF-8 sequence (per the pattern documented at the top of this
/// implementation) is replaced by the UTF-8 encoding of U+FFFD.
///
/// Well-formed sequences are copied unchanged. Each invalid byte yields one
/// replacement character, and scanning resumes at the very next byte.
///
/// @return The sanitized data; empty when the receiver is empty.
- (NSData *)UTF8Data {
    // UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, spelled out as bytes so
    // the result does not depend on the encoding of this source file.
    static const uint8_t kReplacement[] = {0xEF, 0xBF, 0xBD};

    const uint8_t *bytes = self.bytes;
    const NSUInteger dataLength = self.length;
    NSMutableData *result = [[NSMutableData alloc] initWithCapacity:dataLength];

    NSUInteger index = 0;
    while (index < dataLength) {
        NSUInteger sequenceLength = WDYWellFormedUTF8SequenceLength(bytes + index, dataLength - index);
        if (sequenceLength == 0) {
            // Skip exactly one byte so that a valid sequence starting right
            // after the bad byte is still recognized.
            [result appendBytes:kReplacement length:sizeof(kReplacement)];
            index += 1;
        } else {
            [result appendBytes:bytes + index length:sequenceLength];
            index += sequenceLength;
        }
    }
    return result;
}

@end
网友评论
{
NSStringEncoding encoding = CFStringConvertEncodingToNSStringEncoding(kCFStringEncodingGB_18030_2000);
NSData *pageData = [str dataUsingEncoding:NSUTF8StringEncoding];
NSLog(@"pageData = %@",pageData);
NSString *string = [[NSString alloc] initWithData:pageData encoding:encoding];
return string;
}
这种 为nil咋回事
返回�并不是问题,你如果仔细看代码,你会发现,我代码中把非法的UTF-8用�代替了,如果你的NSData转码后全都是�,那说明你NSData中不包含UTF-8字符,应该换个思路考虑一下,是不是GBK编码的
NSString *rawString=[[NSString alloc]initWithData:data encoding:myEncoding];就OK了