美文网首页
联系方式识别(php版本)

联系方式识别(php版本)

作者: Rohn | 来源:发表于2018-04-28 17:05 被阅读18次
 <?php
/**
 * Contact-information recognizer.
 *
 * Walks a piece of text one character at a time and collects runs of digits
 * that are long enough to look like a contact number. Besides ASCII digits,
 * "variant" digits (circled / parenthesised numbers, Roman numerals, Chinese
 * numerals and a few look-alike Latin letters) are mapped to their numeric
 * value. A run is capped at MAX_GROUP_LENGTH digits, and every qualifying run
 * found in the text is reported.
 *
 * @author Rohn(253133755@qq.com)
 * @date   2018/4/25
 */
class ContactRecognize
{

    // The input text, split into single (multi-byte aware) characters.
    private $_arr = array();
    // Digit runs that satisfied the length / non-repetition rules.
    private $_box = array();
    // Digits of the run currently being collected.
    private $_shed = array();

    // Characters that are treated as digits although they are not 0-9.
    private $_variation_num = array(
        '①' => 1,
        '②' => 2,
        '③' => 3,
        '④' => 4,
        '⑤' => 5,
        '⑥' => 6,
        '⑦' => 7,
        '⑧' => 8,
        '⑨' => 9,
        '㈠' => 1,
        '㈡' => 2,
        '㈢' => 3,
        '㈣' => 4,
        '㈤' => 5,
        '㈥' => 6,
        '㈦' => 7,
        '㈧' => 8,
        '㈨' => 9,
        '⑴' => 1,
        '⑵' => 2,
        '⑶' => 3,
        '⑷' => 4,
        '⑸' => 5,
        '⑹' => 6,
        '⑺' => 7,
        '⑻' => 8,
        '⑼' => 9,
        'Ⅰ' => 1,
        'Ⅱ' => 2,
        'Ⅲ' => 3,
        'Ⅳ' => 4,
        'Ⅴ' => 5,
        'Ⅵ' => 6,
        'Ⅶ' => 7,
        'Ⅷ' => 8,
        'Ⅸ' => 9,
        // Everyday Chinese numerals (plus the look-alike "久")
        '一' => 1,
        '二' => 2,
        '三' => 3,
        '四' => 4,
        '五' => 5,
        '六' => 6,
        '七' => 7,
        '八' => 8,
        '九' => 9,
        '久' => 9,
        // Formal Chinese numerals
        '零' => 0,
        '壹' => 1,
        '贰' => 2,
        '叁' => 3,
        '肆' => 4,
        '伍' => 5,
        '陆' => 6,
        '柒' => 7,
        '捌' => 8,
        '玖' => 9,
        // Latin letters that look like digits
        'o' => 0,
        'O' => 0,
        'l' => 1,
        'I' => 1,
    );

    // Despite its name this is the MINIMUM number of digits a run must have
    // to be reported (see _intoBox()); it is also a public constant, so the
    // name is kept for backward compatibility.
    const MAX_NUMBER_LENGTH = 6;
    // Marker returned by _formatChar() for characters that end a digit run.
    const FLAG_RESET = 'reset';
    // Upper bound on the number of digits collected for a single run.
    const MAX_GROUP_LENGTH = 20;

    /**
     * ContactRecognize constructor.
     * @param string $str Text to scan (expected to be UTF-8).
     */
    public function __construct($str){

        $this->_arr = $this->_ch2arr((string)$str);
    }

    /**
     * Run the recognition over the text given to the constructor.
     *
     * The method may be called repeatedly; every call starts from a clean
     * state and yields the same result.
     *
     * @return string|false JSON-encoded list of digit runs (each run is a list
     *                      of single digits; plain digits are strings, variant
     *                      digits are integers), or false when nothing matched.
     */
    public function recognize(){

        // Reset the accumulated results so repeated calls do not pile up
        // duplicates, and start with an explicit state instead of relying on
        // an undefined variable being treated as 0.
        $this->_box = array();
        $curState   = $this->_resetState();

        foreach($this->_arr as $char){

            $number = $this->_formatChar($char);
            // Noise character: neither a digit nor a separator, skip it.
            if($number === false){
                continue;
            }
            // The current run already holds the maximum number of digits:
            // drop overflow digits and wait for a separator to flush the run.
            if($curState >= self::MAX_GROUP_LENGTH && $number !== self::FLAG_RESET){
                continue;
            }
            $curState = $this->_setState($number, $curState);
        }
        // The text may end in the middle of a run; give it a chance to qualify.
        $this->_intoBox($curState);
        if(count($this->_box) > 0){
            return json_encode($this->_box);
        }

        return false;
    }

    /**
     * Whether the current run is exempt from being reported.
     * a) runs made of one repeated digit, e.g. 555555 or 6666666666
     * @return bool
     */
    private function _isExempt(){

        return $this->_isAllRepeat();
    }

    /**
     * Whether every digit of the current run is the same digit.
     *
     * The whole run is inspected: looking only at its tail would wrongly
     * exempt numbers that merely end in repeated digits (e.g. 13800000000).
     * @return bool
     */
    private function _isAllRepeat(){

        // array_count_values() folds "5" and 5 into the same key, so plain
        // and variant digits are compared by value.
        return count(array_count_values($this->_shed)) === 1;
    }

    /**
     * Advance the state machine by one significant character.
     * @param string|int $number A digit, or FLAG_RESET for a separator.
     * @param int $curState Number of digits collected so far.
     * @return int The new state.
     */
    private function _setState($number, $curState){

        // Strict comparison: before PHP 8 the loose form `0 == 'reset'` is
        // true, which made variant zeros (零, o, O) behave like separators.
        if($number === self::FLAG_RESET){

            $this->_intoBox($curState);

            return $this->_resetState();
        }

        return $this->_moveState($number, $curState);
    }

    /**
     * Store the current run in the result box when it is long enough and is
     * not exempt (see _isExempt()).
     * @param int $curState Number of digits collected so far.
     */
    private function _intoBox($curState){

        if($curState >= self::MAX_NUMBER_LENGTH && !$this->_isExempt()){
            $this->_box[] = $this->_shed;
        }
    }

    /**
     * Append a digit to the current run and move one state forward.
     * @param string|int $number
     * @param int $curState
     * @return int
     */
    private function _moveState($number, $curState){

        $this->_shed[] = $number;

        return $curState + 1;
    }

    /**
     * Discard the current run and return the initial state.
     * @return int Always 0.
     */
    private function _resetState(){

        $this->_shed = array();

        return 0;
    }

    /**
     * Classify a single character.
     * @param string $char
     * @return string|int|false
     *  the digit itself (string for 0-9, int for variant digits),
     *  FLAG_RESET for a separator that ends the current run,
     *  false for a noise character that is ignored.
     */
    private function _formatChar($char){

        // Plain ASCII digit
        if(strlen($char) === 1 && strpos('0123456789', $char) !== false){
            return $char;
        }
        // Variant digit
        $rs = $this->_isVariation($char);
        if($rs !== false){
            return $rs;
        }
        // Separator
        if($this->_isRest($char)){
            return self::FLAG_RESET;
        }

        return false;
    }

    /**
     * Whether the character ends a digit run: Latin letters, whitespace and
     * Chinese characters do.
     * @param string $char
     * @return bool
     */
    private function _isRest($char){

        // Latin letter or whitespace
        if(preg_match("/[a-zA-Z\s]/", $char)){
            return true;
        }
        // Chinese character
        if(preg_match('/[\x{4e00}-\x{9fa5}]/u', $char) > 0){
            return true;
        }

        return false;
    }

    /**
     * Look the character up in the variant-digit table.
     * @param string $char
     * @return int|false The numeric value, or false when it is no variant digit.
     */
    private function _isVariation($char){

        return isset($this->_variation_num[$char])?$this->_variation_num[$char]:false;
    }

    /**
     * Split a string into an array of single characters (multi-byte aware).
     * @param string $str
     * @param string $charset
     * @return array
     */
    private function _ch2arr($str, $charset = 'utf-8'){

        if($str === ''){
            return array();
        }
        // mb_str_split() (PHP >= 7.4) does the split in one linear pass; the
        // mb_substr() loop below re-scans the string for every character.
        if(function_exists('mb_str_split')){
            return mb_str_split($str, 1, $charset);
        }

        $length = mb_strlen($str, $charset);
        $array  = array();
        for($i = 0; $i < $length; $i++){
            $array[] = mb_substr($str, $i, 1, $charset);
        }

        return $array;
    }
}

// Demo: scan a sample string that mixes plain digits, variant digits,
// Chinese text and Latin letters, then print whatever recognize() returns
// (a JSON string, or nothing at all when the result is false).
$sample     = '12资源12 零3456哈哈12Ⅶ 34567@a1234567890O00o001';
$recognizer = new ContactRecognize($sample);
$result     = $recognizer->recognize();
echo $result;

相关文章

网友评论

      本文标题:联系方式识别(php版本)

      本文链接:https://www.haomeiwen.com/subject/bdaglftx.html