## 分组统计
display(df %>% filter(RESP_DT == '20191102') %>% group_by(colname) %>% summarise(cnt = length(RESP_DT)) %>% arrange(cnt))
## 表连接
a<-df%>%dplyr::filter(RESP_TYPE_NM=="Mail Open")%>%dplyr::group_by(colname)%>%dplyr::summarise(email_cnt=length(unique(email_name)))
c<-df %>%dplyr::group_by(colname)%>%dplyr::summarise(email_cnt=length(unique(email_name)))
c<-df%>%dplyr::group_by(colname)%>%dplyr::summarise(email_cnt=n_distinct(email_name))
d<-c%>%dplyr::left_join(a,by="colname")%>%dplyr::left_join(b,by="colname")
### 增加字段
d<-dplyr::mutate(d,clickpercent=click/open)
###表连接
c<-c%>%dplyr::left_join(a,by="colname")%>%dplyr::left_join(b,by="colname")
x %>% left_join(y, by = c("name" = "name"))
coon1 <- merge(x =contact_phone_nbr,y = phone_nbr,by.x='phonenumber',by.y='id',all.x = T)
### 多条件匹配,交集
merge(a,b,by = intersect(names(a), names(b)))
### 两个相同的字段匹配,左连接
merge(a,b,all.x=TRUE)
## 修改日期
a$col<- lubridate::ymd(a$col)
a$DT<-lubridate::ymd(s$DT)
## 字符串截取函数
a$col<-substring(a$col,1,9)
substr(data,1,4)
## 新增加一个自增字段
a<-a%>% dplyr::mutate(ID=row_number())
## 字符串匹配函数
unique(a[grep('xxx',tolower(a$col)),]$col)
txt <- c("Dother")
txt1 <- c("otherD")
b <- df[grep(txt,df$CN),]
b1 <- df[grep(txt1,df$CN),]
## ifelse语句
phone <- phone %>% dplyr::mutate(flag = case_when(phone$raw_phone %in% blacklist_phone$Phone~ '1',
phone$raw_phone %in% blacklist_phone$cleanphone ~ '1',
phone$cleanphone %in% blacklist_phone$Phone~ '1',
phone$cleanphone %in% blacklist_phone$cleanphone ~ '1',
TRUE ~ '0'))
## 删除字段
df[,"colname"]<-NULL
## 去重
df%>% unique()
## 两个字段合并后去重计数
length(unique(paste(df$col1,df$col2)))
## 筛选出col1,col2计数超过2的字段
filter(df%>% group_by(col1,col2) %>% summarise(cnt = length(col)),cnt > 1) %>% arrange(desc(cnt))
## 去空格
df$col<- gsub(" ", "", df$col, fixed = TRUE)
df$col<- trimws(df$col,which="both")
## 拉横数据
library(reshape2)
library(dplyr)
df<- reshape2::dcast(df, col1+col2~Type, value.var = "price")
## 拉直数据
library(reshape2)
a1 <- a %>% reshape2::melt(id=c("A_NM","B_NM","C_NM") %>% dplyr::arrange(A_NM))
# 分条件加列
df$flag <- ifelse((df$rawphone %in% blacklist_phone$phone| phone$rawphone %in% blacklist_phone$cleanphone | phone$cleanphone %in% blacklist_phone$MAIN_PHN_NBR | phone$cleanphone %in% blacklist_phone$cleanphone),1,0)
# 选出最新日期(三种方法)
a1<-a %>% dplyr::group_by(col1,col2) %>% arrange(col1,col2,desc(date)) %>%dplyr::mutate(index=row_number()) %>% dplyr::filter(index==1)
a1 <- a %>% dplyr::group_by(col1,col2) %>% dplyr::arrange(QUANTITY,desc(date))%>% mutate(rn = rank(desc(date), ties.method = "first")) %>% dplyr::filter(rn==1)
a1 <- a %>% dplyr::group_by(col1,col2) %>% dplyr::arrange(QUANTITY,desc(date))%>% mutate(id=seq(1,length(paste(col1,col2)))) %>% dplyr::filter(id==1)
# 去空格大写
df$col1<-toupper(gsub(" ", "", df$col1, fixed = TRUE))
## 检查两列是否有重复
df$col<- paste(df$col1,df$col2,sep = "")
display(df%>% dplyr::group_by(col) %>% dplyr::summarise(cnt=length(col)) %>% dplyr::arrange(desc(cnt)))
## 不满足条件,终止下面cmd的运行
stopifnot( nrow(df%>% dplyr::group_by(col1) %>% dplyr::summarise(cnt=length(col2)) %>% dplyr::filter(cnt>1)) != 0)
if (nrow(df%>% dplyr::group_by(col1) %>% dplyr::summarise(cnt=length(col2)) %>% dplyr::filter(cnt>1)) != 0) {
stop("invalid productid please double check if any space or else in, and resave the file or the script will not run")
}
if model_day==view_result['end_date'].unique().max():
del view_result['end_date']
else:
raise BaseException
## 判断是否相等
identical( 2,2) / 2==2
## 不在
df%>% dplyr::filter( !(df$col1%in% df1$col1) )
## 当一个col1对应多个col2时,随机选取一条
display(aa<-c%>%dplyr::group_by(col1)%>%dplyr::mutate(cnt=length(col2)) %>% mutate(seq=row_number()) %>%dplyr::filter(seq==1) )
##生成123456789
df$col<- rownames(df)
## 非空去掉
df<- df%>% dplyr::filter(col !='')
## 每个col,随意选择两条
b1 <- df%>% dplyr::group_by(col) %>% dplyr::arrange(col1) %>% dplyr::mutate(id=seq(1,length(paste(col,col1,seq="")))) %>% dplyr::filter(id <=2)
## 指定字段重命名
colnames(df)[6] <- 'colName'
b<-df[which((df$col)=="999999999"),]
table(a$Tag,a$Tag_1)
## 以特定字符把列进行切分(分成多列)
df$col<- strsplit(df$col,"_")
sd<- data.frame(sd,p=sapply(sd$col1,function(i){i[[1]]}),q=sapply(df$col1,function(i){i[[2]]}))
## 生成系统时间的下月一号
lubridate::ymd(paste0(substr(as.character(Sys.Date() %m+% base::months(1)), 1, 8),"01"))
网友评论