1、读入数据文件

> library(pacman)
> p_load(stringr, dplyr, dtplyr, chinese.misc)

> f <- dir_or_file("./", special = "fangfang*")
>
> df <- lapply(f, read.csv) %>% do.call(bind_rows, .) %>% 
+     purrr::map_if(is.factor, as.character) %>% 
+     as_tibble()
>
> df$time <- df$time %>% str_replace("今天", "4月9日")
> str(df)

## tibble [203 x 2] (S3: tbl_df/tbl/data.frame)
##  $ time   : chr [1:203] "4月9日 16:54 来自 iPhone XS" "4月8日 17:29 来自 iPhone XS" "4月8日 00:45 来自 360安全浏览器" "4月8日 00:37 来自 360安全浏览器" ...
##  $ content: chr [1:203] "摘：“我相信一个强大的国家不会因为一本书的出版就坍塌掉，一个自信的政府也不会因为一本书就无端地指责作家。2020年及"| __truncated__ 
+  "发个视频吧。我现在说什么都被叫骂。真是领教了网络暴力。极左势力实在厉害，而且强大！真的是像病毒一样一圈一圈地感"| __truncated__ 
+  "以为自己会很冷静，但是看到视频，还是很激动。" "转。不懂常识也没办法。我早说过，辱骂我，我懒得计较，因为用那种低级文章骂人，丢脸的是他们自己，我有什么好在乎的"| __truncated__ ...

可以看到，一共有203篇博文。

2、整理time列

2020年（今年）的日期中默认没有年份标识，但去年的有。所以需要手动加上，保持格式一致。

> dt <- df
> 
> dt$time[!str_detect(dt$time, "^2019")] <- paste0("2020 ", 
+     dt$time[!str_detect(dt$time, "^2019")])
>
> dt <- tidyr::separate(dt, time, 
+     into = c("year", "month", "day", "time", "client"), sep = " ")
> dt <- tidyr::separate(dt, time, into = c("hour", "minute"), sep = ":")
> 
> dt <- dt %>% mutate_at(.vars = c("year", "month", "day", "hour", "minute"),
+      .funs = as.integer) %>% 
+     arrange(year, month, day, hour, minute)
> 
> dt[c(1,203),]

# A tibble: 2 x 7
   year month   day  hour minute client    content                                    
  <int> <int> <int> <int>  <int> <chr>     <chr>                                      
1  2019     8    22    15     35 iPhone客户~ 转。厉害。                                 
2  2020     4     9    16     54 iPhone    摘：“我相信一个强大的国家不会因为一本书的出版就坍塌掉，一个自信的政府也不会因为一本~

时间跨度从2019年8月22日15点35分到2020年4月9日16点54分。

3、整理content列

删除博文中的无用信息，比如表情符号和“转、帮转、摘”等文字。

> # 预处理函数
> preprocessor = function(x) {
+     # 清除表情符号
+     x <- str_replace_all(x, "<.*?>", "")
+     # 清除不需要的内容
+     x <- str_replace_all(x, "[转|帮转|摘|继续补漏].*?[：。、！]", 
+         "")
+     x <- str_replace_all(x, "收起全文.*", "")
+     x <- str_replace_all(x, "L方方的微博视频.*", "")
+     return(x)
+ }
> 
> dt$content <- dt$content %>% preprocessor()

4、老师喜欢在几点钟发微博

> # 设置画图字体
> p_load(showtext)
> font_add("PingFang", regular = "PingFang Regular.ttf")
> showtext_auto()

> p_load(ggplot2)
> table(as.factor(dt$hour)) %>% as.data.frame() %>% 
+     ggplot(aes(reorder(Var1, -Freq), Freq)) + 
+     geom_bar(stat = "identity", fill = "#F8511D") + 
+     theme_bw() + labs(x = "时辰", y = "")

日常发博时间

老师喜欢在下午3-5点和半夜11-12点发微博。

5、老师跟哪些人交互较多

找出她转发或评论的最后一个人，按次数排序。

> contact <- str_match(dt$content, "@.*?:") %>% unlist
> contact[!is.na(contact)] %>% str_replace_all(":", "") %>% jiebaR::freq() %>% 
+     filter(char != "@方方") %>% arrange(-freq) %>% head(10) %>% 
+     ggplot(aes(freq, reorder(char, freq))) + geom_bar(stat = "identity", 
+     width = 0.7, fill = "#F8511D") + 
+     theme_bw() + labs(x = "次数", y = "")

老师喜欢转发或评论的用户

@静娅_ 和 @他回精神病院了比较惹眼。。

6、老师喜欢用什么终端发微博

> dt.temp <- jiebaR::freq(dt$client)
> label = paste0("：",round(dt.temp$freq / sum(dt.temp$freq) * 100,2), "%")
> 
> blank_theme <- theme_minimal()+
+   theme(
+         legend.position = "none",
+         panel.border = element_blank(),
+         axis.title.y = element_blank(),
+         axis.ticks.y = element_blank(),
+         axis.text.y = element_blank(),
+         panel.grid = element_line(color = "gray60",linetype = 2),
+         axis.text.x = element_text(size=14, face="bold")
+   )
> 
> ggplot(dt.temp,aes(factor(x=char,
+                           levels = c("HUAWEI","微博","iPhone","360安全浏览器","iPhone客户端"),
+                           labels = c(paste0(char,label))),
+                           y=freq,fill=char)) +
+   geom_bar(stat = "identity",width = 1) +
+   coord_polar(theta = "x") +
+   blank_theme +
+   labs(x="",y="")

发博终端

老师在这段期间共使用过3种手机，喜欢使用电脑刷微博，屏幕大，打字快，精神。。

7、微博内容分析

> p_load(jiebaR, wordcloud2)
> 
> user_dict <- "./dict/武汉城市精选1.txt"
> wk <- worker(user = user_dict, stop_word = "./dict/mystopwords.txt")
> 
> txt <- lapply(dt$content, segment, wk)
> vocab <- txt %>% as.character2 %>% freq()

使用最多的10个词：

> vocab %>% top_n(10) %>% 
+     ggplot(aes(freq, reorder(char, freq))) + 
+     geom_col(fill = "#F8511D", width = 0.8) + 
+     labs(x = NULL, y = NULL)

高频词

老师对“武汉”爱得很深沉，并且还非常关注此次新冠疫情。

词频分布图
文字越靠上，使用频率越高，但你能看到的所有词的使用频率均超过10次：

> vocab %>% filter(freq > 10) %>% ggplot(aes(char, freq)) + 
+     geom_jitter(alpha = 0.1, 
+     size = 2.5, width = 0.25, height = 0.25) + 
+     geom_text(aes(label = char), check_overlap = TRUE, vjust = 1.5) + 
+     theme(axis.text.y = element_blank(), 
+     axis.ticks.y = element_blank()) + labs(x = "", y = "")

老师经常使用的词语

词云图：

vocab %>% filter(freq > 10) %>% 
  wordcloud2(color = "random-dark",backgroundColor = "grey20",
             minRotation = -pi/6,maxRotation = -pi/6)

文字越大，代表出现的频率越高。太小的反正也看不清楚，就舍去了。

画个帅帅的词云图

8、词语相关性分析

从上面的高频词分析中可以看到，老师很喜欢“说”这个字，我们看看“说”喜欢和谁在一起。

> p_load(tidytext, tidyr, ggraph)
> 
> words <- tibble(id = 1:length(txt), word = txt)
> grams <- words %>% unnest() %>% 
+     unnest_tokens(words, word, token = "ngrams", n = 2)
> 
> grams.sep <- grams %>% 
+     separate(words, into = c("wd1", "wd2"), sep = " ") %>% 
+     filter(wd2 == "说") %>% count(wd1, wd2, sort = T)
> 
> grams.sep %>% unite(words, wd1, wd2, sep = " ") %>% 
+     top_n(10) %>% 
+     print()

Selecting by n
# A tibble: 14 x 2
   words       n
   <chr>   <int>
 1 朋友 说    35
 2 医生 说     8
 3 想 说       7
 4 要 说       7
 5 可以 说     6
 6 说 说       6
 7 信息 说     6
 8 地 说       5
 9 告诉 说     5
10 人 说       5
11 所 说       5
12 同学 说     5
13 我 说       5
14 专家 说     5

n表示出现次数。可以看出老师的朋友应该很多，平时应该也挺喜欢和朋友聊天的。

> grams.sep %>% ggraph(layout = "fr") + 
+     geom_edge_link(aes(edge_alpha = n), 
+         show.legend = FALSE, end_cap = circle(0.07, "inches")) + 
+     geom_node_point(color = "lightblue", size = 5) + 
+     geom_node_text(aes(label = name), 
+         vjust = 1, hjust = 1) + theme_void()