19-R+Python处理MEM班同学微信签名

作者: wonphen | 来源:发表于2019-12-27 19:49 被阅读0次

    1 准备工作

    • 备注微信好友真实姓名。
    • 通过Python的itchat库爬取所有微信好友信息。代码如下:
    #-*- coding:utf-8 -*-
    
    """
    -----------------------------------
    版本:Python3.6.1
    -----------------------------------
    """
    import io
    # 导入itchat包
    import itchat
    def main():
        """Log in to WeChat via itchat and dump the friend list to a text file.

        The list returned by itchat is written as its plain ``str()``
        representation (not JSON); the R code later in this post parses that
        exact textual layout, so the format is kept as-is.
        """
        # Log in by scanning a QR code (hotReload=True keeps the login state
        # alive for a while, so repeated runs do not need a new scan).
        itchat.auto_login(hotReload=True)
        # Fetch the friend list
        friends = itchat.get_friends(update=True)
        # print(friends)
        # Open with 'w' rather than 'a': appending on a second run would
        # concatenate another copy of the list into the same file and break
        # the column-position-based parsing done downstream.
        with io.open('C:/Users/Admin/Downloads/friend.txt', 'w', encoding='utf-8') as f:
            f.write(str(friends))
    
    if __name__ == '__main__':
        main()
    

    2 文本整理

    2.1 读取文本

    # Attach the packages used below via pacman.
    library(pacman)
    p_load(data.table, tidyr, stringr, dplyr)

    # Read the raw dump, splitting on "{" (presumably one column per friend
    # record -- confirm with the str() output below).
    df <- fread("C:/Users/Admin/Downloads/friend.txt", header = FALSE,
                stringsAsFactors = FALSE, encoding = "UTF-8", sep = "{")
    # Inspect the structure of the table
    str(df)
    
    # Drop columns 1 and 2: column 1 holds no data and column 2 is the account
    # owner, whose record layout differs from the other columns. Dropping by
    # negative index keeps every remaining column regardless of how many
    # friends there are (the original hard-coded 3:692).
    df <- df[, -(1:2), with = FALSE]
    

    2.2 将有用信息清洗成原始素材

    # Reshape wide to long: the original column name goes to `id`, the raw
    # record text goes to `info`.
    df2 <- gather(df, key = "id", value = "info")
    
    # Split the raw record text on "," into one column per field.
    # NOTE(review): a value that itself contains "," (e.g. a signature) would
    # shift every field after it -- confirm against the data.
    field_names <- c("Uin", "UserName", "NickName", "HeadImgUrl", "ContactFlag",
                     "MemberCount", "MemberList", "RemarkName", "HideInputBarFlag", "Sex",
                     "Signature", "VerifyFlag", "OwnerUin", "PYInitial", "PYQuanPin",
                     "RemarkPYInitial", "RemarkPYQuanPin", "StarFriend", "AppAccountFlag", "Statues",
                     "AttrStatus", "Province", "City", "Alias", "SnsFlag",
                     "UniFriend", "DisplayName", "ChatRoomId", "KeyWord", "EncryChatRoomId", "IsOwner")
    df3 <- separate(df2, info, sep = ",", into = field_names)
    
    # Keep only the columns that are needed. all_of() makes it explicit that
    # `need` is an external character vector, not a column called "need".
    need <- c("id", "NickName", "RemarkName", "Sex", "Signature", "Province", "City")
    df4 <- df3 %>% select(all_of(need))
    
    # Keep only the value part of each field: skip the leading key text (the
    # start offset depends on the key's length) and drop the last character.
    # NOTE(review): the Sex offsets assume the same quoting as the text
    # fields -- confirm the extracted values are not empty.
    df4 <- df4 %>%
      mutate(
        NickName   = str_sub(NickName, start = 15L, end = -2L),
        RemarkName = str_sub(RemarkName, start = 17L, end = -2L),
        Sex        = str_sub(Sex, start = 10L, end = -2L),
        Signature  = str_sub(Signature, start = 16L, end = -2L),
        Province   = str_sub(Province, start = 15L, end = -2L),
        City       = str_sub(City, start = 11L, end = -2L)
      )
    
    # Write the roughly cleaned table to a file
    write.csv(df4, "C:/Users/Admin/Downloads/friend.zl.csv")
    

    2.3 整理MEM班级同学签名

    # Read the cleaned file back
    df5 <- read.csv("C:/Users/Admin/Downloads/friend.zl.csv",
                    header = TRUE, stringsAsFactors = FALSE)
    
    # If RemarkName is empty (or missing), use NickName instead. The result has
    # to be assigned back: a bare ifelse() call only prints the vector and
    # leaves df5 unchanged.
    no_remark <- is.na(df5$RemarkName) | df5$RemarkName == ""
    df5$RemarkName <- ifelse(no_remark, df5$NickName, df5$RemarkName)
    
    # Check whether RemarkName still has missing values
    n <- sum(is.na(df5$RemarkName)); n
    
    ## [1] 0
    
    # Read the class roster
    df.name <- read.csv("./mem.names.csv", header = TRUE, stringsAsFactors = FALSE)
    
    # Pick the signatures of the class members by joining the roster's name
    # column against RemarkName
    df6 <- left_join(df.name, df5, by = c(`姓名` = "RemarkName"))
    
    # Take the signatures, trim them and paste them into a single text.
    # Missing signatures are blanked BEFORE pasting; removing the literal
    # string "NA" afterwards would also delete "NA" inside real words.
    sig <- str_trim(df6$Signature)
    sig[is.na(sig)] <- ""
    # "16L" is stripped exactly as in the original.
    # NOTE(review): presumably a leftover artifact in the data -- confirm.
    txt <- sig %>% paste(collapse = " ") %>% str_remove_all("16L"); txt
    
    ## [1] "不以物喜,不以己悲。   Maybe the fault does not lie in the way but in the choice.  
    一往无前虎山行 尽头在哪儿呢?在学习中!! 一手烂牌是运气,打好烂牌是本事。   
    ∞ The princess is so cool 一枚新时代宝藏硬核女战士  阿耨多罗三藐三菩提心  Talk is chea   
    淡泊明志  寧靜致遠  人生真味-淡,人生风度-忘。  哪些是科学,哪些是魔术,哪些是信仰  
    我的征途是星辰大海  君子坦荡荡~~ 人生绝非一场消遣。 学习是一种信仰 stay hungry stay foolish  
    能在艰苦中成长更需要一份坚决的魄力   "
    

    3 分词

    # Attach the jiebaR segmentation package
    p_load(jiebaR)
    
    # Create a segmentation worker that loads the stop-word dictionary, so that
    # meaningless stop words (particles such as 的/得/地, digits, punctuation,
    # etc.) are removed
    wk <- worker(
      stop_word = "./dict/characters-master/stop_words"
    )
    # Segment the text with this worker and show the tokens
    txt1 <- segment(txt, wk)
    txt1
    
    ##  [1] "不以"     "物喜"     "不以己"   "悲"       "Maybe"    "fault"    "lie"      "choice"   "一往无前" "虎山行"   "尽头"     "学习"     "中"       "一手"     "烂牌"    
    ## [16] "运气"     "好烂"     "牌"       "本事"     "The"      "princess" "cool"     "一枚"     "新"       "时代"     "宝藏"     "硬核"     "女战士"   "耨"       "多罗"    
    ## [31] "三"       "藐三"     "菩提"     "心"       "Talk"     "chea"     "淡泊明志" "寧靜致遠" "人生"     "真味"     "淡"       "人生"     "风度"     "忘"       "科学"    
    ## [46] "魔术"     "信仰"     "征途"     "星辰"     "大海"     "君子"     "坦荡荡"   "人生"     "一场"     "消遣"     "学习"     "一种"     "信仰"     "stay"     "hungry"  
    ## [61] "stay"     "foolish"  "艰苦"     "中"       "成长"     "更"       "一份"     "魄力"
    

    可以看到,有些词的切分有问题(例如“不以物喜”被拆成了“不以”和“物喜”),需要手动将这些词添加到分词器中。

    # Add new words to the segmenter (worker `wk`) so that they are kept as
    # single tokens when the text is segmented again
    new_user_word(wk,"不以物喜")
    new_user_word(wk,"不以己悲")
    new_user_word(wk,"烂牌")
    new_user_word(wk,"阿耨多罗三藐三菩提心")
    

    重新分词:

    # Segment the text again with the updated worker and show the tokens
    txt2 <- segment(txt, wk)
    txt2
    
    ##  [1] "不以物喜"             "不以己悲"             "Maybe"                "fault"                "lie"                  "choice"               "一往无前"            
    ##  [8] "虎山行"               "尽头"                 "学习"                 "中"                   "一手"                 "烂牌"                 "运气"                
    ## [15] "打好"                 "烂牌"                 "本事"                 "The"                  "princess"             "cool"                 "一枚"                
    ## [22] "新"                   "时代"                 "宝藏"                 "硬核"                 "女战士"               "阿耨多罗三藐三菩提心" "Talk"                
    ## [29] "chea"                 "淡泊明志"             "寧靜致遠"             "人生"                 "真味"                 "淡"                   "人生"                
    ## [36] "风度"                 "忘"                   "科学"                 "魔术"                 "信仰"                 "征途"                 "星辰"                
    ## [43] "大海"                 "君子"                 "坦荡荡"               "人生"                 "一场"                 "消遣"                 "学习"                
    ## [50] "一种"                 "信仰"                 "stay"                 "hungry"               "stay"                 "foolish"              "艰苦"                
    ## [57] "中"                   "成长"                 "更"                   "一份"                 "魄力"
    

    统计词频:

    # Count word frequencies of the re-segmented tokens. The result keeps the
    # name `freq` (shadowing the jiebaR function) because the word-cloud calls
    # below refer to it by that name.
    freq <- jiebaR::freq(txt2)
    

    4 词云图

    # Attach the word-cloud package
    p_load(wordcloud2)
    
    # Figure 1: random light colours on a grey background
    wordcloud2(
      data = freq,
      size = 0.5,
      fontFamily = "微软雅黑",
      color = "random-light",
      backgroundColor = "grey"
    )
    
    图一
    # Figure 2: rotation range pinned to -pi/2 (min and max are equal)
    wordcloud2(
      data = freq,
      size = 0.5,
      minRotation = -pi / 2,
      maxRotation = -pi / 2
    )
    
    图二
    # Figure 3: rotation range pinned to -pi/6, with rotateRatio set to 1
    wordcloud2(
      data = freq,
      size = 0.5,
      minRotation = -pi / 6,
      maxRotation = -pi / 6,
      rotateRatio = 1
    )
    
    图三
    # Figure 4: word cloud drawn inside a custom image.
    # According to the author, drawing inside a custom image fails because of
    # a bug in the latest wordcloud2 release, with no fix available yet. The
    # suggested workaround is to go back to the old 0.2.0 release:
    #   1. Remove the installed package: remove.packages("wordcloud2")
    #   2. Download the old wordcloud2 release
    #      (download location: Index of /src/contrib/Archive/wordcloud2)
    #   3. Install the downloaded tar.gz file manually
    # Put the image into this folder:
    #   C:\Users\Admin\Documents\R\win-library\3.6\wordcloud2\examples
    #   (replace the leading part with your own R installation path)
    tu <- system.file("examples/t.png", package = "wordcloud2")
    # system.file() returns "" when the file cannot be found; stop with a clear
    # message instead of handing an empty path to wordcloud2().
    if (!nzchar(tu)) {
      stop("examples/t.png was not found in the installed wordcloud2 package",
           call. = FALSE)
    }
    wordcloud2(freq, figPath = tu, size = 0.5, color = "black")
    
    图四
    # Figure 5: word cloud in the shape of a character.
    # There are only a few words, so a simple shape and character are used.
    letterCloud(
      data = freq,
      word = "王",
      wordSize = 0.5,
      color = "random-dark",
      backgroundColor = "snow"
    )
    

    偶尔能画出来,但现在画不出来😄
    图四和图五的功能还存在BUG。

    相关文章

      网友评论

        本文标题:19-R+Python处理MEM班同学微信签名

        本文链接:https://www.haomeiwen.com/subject/qgadoctx.html