################################################################################
### Basic syntax of R
# This is a comment, start with #
# Variable assignment
# Assign with `<-`, the conventional assignment operator in R.
sample_string <- "Hello, World!"
sample_string = "Hello, World!" # `=` also assigns here, but this form is not recommended
print(sample_string) # Comments can be after a statement
sample_string <- "Hello, World!"
# R is case sensitive: `Sample_string` (capital S) was never defined, so looking
# it up fails with "object 'Sample_string' not found". Wrapping the call in
# try() still reports that error, but lets the rest of the file keep running
# when the script is executed with source() or Rscript.
try(print(Sample_string))
### Data structure in R
# Numerics
print(150 + 60)
is.integer(150 + 60) # is.*() functions test a type. R stores numbers as double by default, so this is FALSE even though the value looks like an integer
is.integer(150L + 60L) # The L suffix marks integer literals, so this is TRUE (alternatively, convert the result afterwards)
# Try the following and see what happens
print(2^32)
2^32 # A bare expression is auto-printed at top level
as.integer(2^31) # Out of the integer range: the result is NA with a warning. as.*() functions convert between types
as.integer(2^30)
is.integer(as.integer(2^30))
.Machine$integer.max # Largest representable integer, 2147483647. Do not force large values to integer: data may be lost.
.Machine$double.xmax #1.797693e+308
.Machine$double.xmin #2.225074e-308
# Type conversion
as.integer(40.75356) # 40: no rounding, only the integer part is kept
floor(40.75356) # "floor": 40
ceiling(40.75356) # "ceiling": 41
150%%24 # 6, the remainder (modulo)
150%%3 #0
24*6 # 144
# Logicals / booleans
gender_male <- TRUE
age_45up <- FALSE
# Compare against the reserved constants TRUE/FALSE rather than T/F: T and F are
# ordinary variables that can be reassigned, which would silently change the
# outcome of these comparisons.
gender_male == TRUE
age_45up != FALSE # "not equal"
print(gender_male & age_45up) # logical AND
print(gender_male | age_45up) # logical OR
# Characters
str1 <- 'Jennifer'
print(str1) #"Jennifer"
str2 <- "Garon"
print(str2)
str3 <- "Bedford'Stuyvesant'" # Single quotes need no escaping inside a double-quoted string
print(str3)
str4 <- paste(str1, str2) # paste() joins its arguments with a single space by default
str4 #"Jennifer Garon"
str5 <- paste(str1, str2, sep = "") # Try to change the sep
str5 #"JenniferGaron"
str5.5 <- paste(str1, str2, sep = "-")
str5.5 #"Jennifer-Garon"
str6 <- paste0(str1, str2) # paste0() behaves like paste() with sep = ""
str6 #"JenniferGaron"
paste(c('a', 'b'), c('c'), sep = ":") # Element-wise with recycling of the shorter argument; handy for interaction terms: "a:c" "b:c"
paste(c('x1', 'x2', 'x3'), collapse = "+") #"x1+x2+x3"
as.character(2845) # Converts a number to a character string: "2845"
# Empty values and infinity values
a <- NA
a # NA is a missing value; it is distinct from NULL (the empty object)
class(a) # A bare NA is logical by default, so this prints "logical"
b <- 1/0
b # Inf, positive infinity
c <- -1/0 # NOTE(review): this variable shares its name with the base function c(); calls such as c(1, 2) still work because R looks for a function when a name is called
c # -Inf, negative infinity
is.na(a) # TRUE
is.na(b) # FALSE: infinity is not a missing value
is.finite(a) # FALSE: NA is not finite
is.finite(b) # FALSE: Inf is not finite
# Vectors
# c() combines its comma-separated arguments into a single vector (or list).
# It is the basic building block for creating data structures by hand.
vec1 <- c(1,2,3,4,5)
vec1 #1 2 3 4 5
vec2 <- 1:10
vec2 #1 2 3 4 5 6 7 8 9 10
vec3 <- seq(1, 10, 2) # Sequence from 1 to 10 in steps of 2
vec3 #1 3 5 7 9
vec4 <- c(str1, str2)
vec4 #"Jennifer" "Garon"
rep(2, 10) # Repeats 2 ten times: 2 2 2 2 2 2 2 2 2 2
vec5 <- c(3,4,5,6,7)
vec5
vec1+vec5 #4 6 8 10 12, element-wise addition
vec1*vec5 #3 8 15 24 35, element-wise multiplication
#####
# The recycling rule *IMPORTANT
vec1 #1 2 3 4 5
vec2 #1 2 3 4 5 6 7 8 9 10
vec2*vec1 # What is the result?
#1 4 9 16 25 6 14 24 36 50: the shorter vector is recycled to the length of the longer one
length(vec2) #10
length(vec1) #5
1:5*1:2
# Warning: In 1:5 * 1:2 : longer object length is not a multiple of shorter object length
# Here 5 is not a multiple of 2, so R recycles anyway but warns about it.
##### !! Check the vector lengths before doing element-wise arithmetic !! #####
# Factors,因子
# Factors store nominal and ordered categorical variables
room_type <- c("Entire home", "Private room")
room_type #"Entire home" "Private room"
class(room_type) # "character"
room_type <- as.factor(room_type)
room_type # The levels are the unique values found in the vector
#[1] Entire home  Private room
#Levels: Entire home Private room
# (Author's note: factors are not fully understood yet; read up on them later.)
class(room_type) # "factor"
# Matrices,矩阵
mat_num <- c(1, 2, 3, 4, 5, 6)
print(mat_num)
mat1 <- matrix(mat_num, nrow = 2) # Two rows; the values fill the matrix column by column
mat1
#      [,1] [,2] [,3]
# [1,]    1    3    5
# [2,]    2    4    6
dim(mat1) # Dimensions of the matrix: [1] 2 3, i.e. two rows and three columns
nrow(mat1) # 2 rows
ncol(mat1) # 3 columns
mat2 <- matrix(mat_num, ncol = 2) # Two columns
mat2
#      [,1] [,2]
# [1,]    1    4
# [2,]    2    5
# [3,]    3    6
mat1%*%mat2 # Matrix product
# Matrix multiplication requires ncol(mat1) to equal nrow(mat2)
# 1*1+3*2+5*3=22   1*4+3*5+5*6=49   2*1+4*2+6*3=28   2*4+4*5+6*6=64
#      [,1] [,2]
# [1,]   22   49
# [2,]   28   64
mat_num <- c(1, 2, 3, 4)
mat3 <- matrix(mat_num, nrow = 2)
mat3
#      [,1] [,2]
# [1,]    1    3
# [2,]    2    4
mat4 <- t(mat3) # Transpose a matrix
mat4
#      [,1] [,2]
# [1,]    1    2
# [2,]    3    4
mat3*mat4 # Element-wise (Hadamard) product: entries in the same position are multiplied. This is NOT the matrix/dot product, which is %*%
#      [,1] [,2]
# [1,]    1    6
# [2,]    6   16
solve(mat3) # Inverse of a matrix: A %*% B = B %*% A = I, where I is the identity matrix (ones on the main diagonal, zeros elsewhere)
#      [,1] [,2]
# [1,]   -2  1.5
# [2,]    1 -0.5
# Lists,列表
# List of different types of data
airbnb_rooms <- list(room_type = c("Entire home", "Private room"), price = c(150, 60))
airbnb_rooms
#$room_type
#[1] "Entire home" "Private room"
#$price
#[1] 150 60
airbnb_rooms$room_type # Extract only the element named room_type
#[1] "Entire home" "Private room"
airbnb_rooms[[1]] # Extract only the first element, by position
#[1] "Entire home" "Private room"
# List of a list: the elements of a list can themselves be lists
airbnb_rooms1 <- list(list("Entire home", 150), list("Private room", 60))
airbnb_rooms1
# Data frames
airbnb_df <- data.frame(name = c("Skylit Midtown Castle", "BlissArtsSpace!"),
host_id = c(2845, 7356),
host_name = c("Jennifer", "Garon"),
room_type = c("Entire home", "Private room"),
price = c(150, 60),
minimum_nights = c(30, 30))
names(airbnb_df) # Names of the object's elements; for a data frame these are the column names
#[1] "name" "host_id" "host_name" "room_type" "price" "minimum_nights"
colnames(airbnb_df)
#[1] "name" "host_id" "host_name" "room_type" "price" "minimum_nights"
# Manipulation of data frame
new_df <- data.frame(name = "Spacious Brooklyn Duplex, Patio + Garden",
host_id = 7378,
host_name = "Rebecca",
room_type = "Entire home",
price = 275,
minimum_nights = 5)
# Add a row/rows: rbind() returns the two data frames stacked into one result
rbind(airbnb_df, new_df)
# The third row of the result holds the data from new_df
airbnb_df
new_df
# Neither original data frame has changed: the combined rows exist only in the returned result
# Add a column
cbind(airbnb_df, latitude = c(40.75356, 40.68535)) # Adds a latitude column; supply as many values as there are rows
airbnb_loc <- data.frame(name = c("Skylit Midtown Castle", "BlissArtsSpace!"),
latitude = c(40.75356, 40.68535),
longtitude = c(-73.98559, -73.95512))
airbnb_loc
#                    name latitude longtitude
#1 Skylit Midtown Castle 40.75356  -73.98559
#2       BlissArtsSpace! 40.68535  -73.95512
airbnb_df
#                    name host_id host_name    room_type price minimum_nights
#1 Skylit Midtown Castle    2845  Jennifer  Entire home   150             30
#2       BlissArtsSpace!    7356     Garon Private room    60             30
airbnb_df <- merge(airbnb_df, airbnb_loc, by = "name") # Join the two data frames on the key column "name"; the result below comes back ordered by that key
airbnb_df
#                    name host_id host_name    room_type price minimum_nights latitude longtitude
#1       BlissArtsSpace!    7356     Garon Private room    60             30 40.68535  -73.95512
#2 Skylit Midtown Castle    2845  Jennifer  Entire home   150             30 40.75356  -73.98559
### Programming structure / statements
# If-else
price <- 1729
# `price` is a single value, so the two comparisons are combined with the
# scalar, short-circuiting `&&`; the element-wise `&` is meant for building
# vector masks. The nested else/if is flattened into an `else if` chain.
if (price >= 100 && price < 200) {
  print("Skylit Midtown Castle")
} else if (price >= 200) {
  print("Spacious Brooklyn Duplex, Patio + Garden")
} else {
  print("BlissArtsSpace!")
}
# Loops
i <- 5
while (i <= 10) {
print(i)
i <- i+1 # Advance the counter so the condition eventually becomes FALSE and the loop exits
}
for (ii in 1:10) {
print(ii)
}
# Correction from Wang Qi: a for loop does not need its loop variable initialised beforehand
### Data and math manipulation
# Functions
# We have see examples of using a function
# Parameters and default parameters
# log(n) is the natural logarithm (base e).
# First version of f1: takes the value as a parameter, prints its natural log
# and (through print) returns that log invisibly.
f1 <- function(n) {
  log_value <- log(n)
  print(log_value)
}
f1(100)
f1(exp(1))
x <- exp(2)
# Second version of f1 (replaces the first): takes no parameter and instead
# reads the variable `x` defined outside the function.
f1 <- function() {
  log_value <- log(x)
  print(log_value)
}
f1()
# Global and local variables
# f2 adds up the integers 1..n in a local accumulator, prints the total and
# (through print) returns it invisibly.
#   n: how many integers to add; defaults to 100.
f2 <- function(n = 100) {
  tmp <- 0 # Local to f2: it does not exist outside the function
  # seq_len(n) is empty when n is 0, whereas 1:n would count down over c(1, 0)
  # and wrongly add 1 to the total.
  for (i in seq_len(n)) {
    tmp <- tmp + i
  }
  print(tmp)
}
f2()
# 5050: the value printed is the function's local variable
# What will be the value of tmp?
tmp <- 0
# f3 runs the same accumulation as a loop, but `<-` inside a function assigns
# to a variable local to that call: the first read of `tmp` finds the global
# value, and every assignment afterwards updates a local copy only.
f3 <- function(n = 100) {
for(i in 1:n) {
tmp <- tmp + i
# tmp <<- tmp + i
# (the commented-out `<<-` form would assign to the `tmp` defined outside the function instead)
}
}
f3()
print(tmp)
# 0: the value printed is the global variable, which f3 did not modify
# Built-in functions
ls() # Lists the names of all objects in the current environment (the workspace); RStudio shows the same in its Environment pane (top right)
# Check documentation of a function
# i.e. open the help pages
?paste
??paste
### Packages
# Install a package
install.packages("foreign")
# Console output from the author's session (the package is downloaded on demand):
#WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
# https://cran.rstudio.com/bin/windows/Rtools/
# trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.3/foreign_0.8-86.zip'
#Content type 'application/zip' length 277174 bytes (270 KB)
#downloaded 270 KB
#package 'foreign' successfully unpacked and MD5 sums checked
#The downloaded binary packages are in
#C:\Users\Administrator\AppData\Local\Temp\RtmpEfD61v\downloaded_packages
# Load a package
library(foreign) # Attach the package; stops with an error if it is not installed
require(foreign) # Same purpose, but returns FALSE (with a warning) instead of stopping when the package is missing
library("foreign") # The package name may also be given as a string
# Update a package
# The first positional argument of update.packages() is `lib.loc` (a library
# directory), not a package name, so "foreign" must be passed through `oldPkgs`
# to restrict the update to that package.
update.packages(oldPkgs = "foreign")
### Working directory
getwd() # Print the current working directory
#[1] "C:/Users/Administrator/Documents"
setwd("c:/") # Change the working directory. NOTE(review): this hard-coded path is machine-specific; relative file names used afterwards are resolved against it
### Load and save data
# csv files
airbnb_listings <- read.csv("listings.csv", stringsAsFactors = FALSE)
# The file must be located in the working directory (the name is a relative path).
# The file should end with a newline; otherwise R reports something like
# "incomplete final line found by readTableHeader on xxx".
airbnb_listings
# # Excel files
# library(readxl)
# df <- read_excel(filename)
# # Stata files
# library(foreign)
# df <- read.dta(filename)
head(airbnb_listings) # Shows 6 rows by default; the header line of the file is not counted as a row
head(airbnb_listings,n=1L) # First row only
tail(airbnb_listings,n=1L) # Last row only
View(airbnb_listings) # Opens the data frame in a spreadsheet-style viewer
# Indexing
# Column indexing
# NOTE(review): `wwj` and `wq` look like columns the author added to their own
# copy of listings.csv; confirm they exist before running these lines.
head(airbnb_listings$name) # `$` extracts a column as a vector; head() shows its first 6 values
head(airbnb_listings$wwj) # `$` gives NULL when the data frame has no column of that name
head(airbnb_listings["price"]) # `[` with a column name returns a one-column data frame, so the values are shown together with the column name
head(airbnb_listings["wwj"]) # `[` stops with "undefined columns selected" when the column does not exist
head(airbnb_listings[c("name", "price")]) # Several columns at once
head(airbnb_listings[c("wq", "wwj")])
head(airbnb_listings[, 2]) # Second column, selected by position
head(airbnb_listings[, 2],n=1L) # First value of the second column
# Row indexing
brooklyn_airbnb <- airbnb_listings[airbnb_listings$neighbourhood_group == "Brooklyn",]
manhattan_airbnb <- airbnb_listings[airbnb_listings$neighbourhood_group == "Manhattan",]
wwj <- airbnb_listings[airbnb_listings$wwj == 'lazy',]
wwj # Complete rows of the table for which wwj == 'lazy' holds
head(airbnb_listings[airbnb_listings$neighbourhood_group == "Brooklyn" & airbnb_listings$price <= 150,])
head(airbnb_listings[airbnb_listings$wwj != "lazy" & airbnb_listings$wq == "cute",]) # Several conditions combined with the element-wise &
# Row + column indexing
head(airbnb_listings[airbnb_listings$neighbourhood_group == "Brooklyn" & airbnb_listings$price <= 150, c("name", "price", "minimum_nights")])
head(airbnb_listings[airbnb_listings$neighbourhood_group == "Brooklyn" & airbnb_listings$price <= 150, 1:5])
head(airbnb_listings[airbnb_listings$wwj == "lazy" , c("wq")]) # Column wq of the rows matching the filter
head(airbnb_listings[airbnb_listings$wwj == "monkey" , 1:2]) # Columns 1 to 2 of the rows matching the filter
airbnb_listings[1, 2] # Value in the first row, second column
malo <- airbnb_listings[1, 2]
# Save your R objects
save(brooklyn_airbnb, manhattan_airbnb, file = "regional_listing.Rdata") # Writes both objects into a single file
save(malo,file = "regional_listing.malo_data") # The file name is free-form; here a custom extension is used
# Load your saved R objects
load("regional_listing.malo_data") # Restores the objects stored in that file under their original names
### A sample preparation codes for projects
# rm(list=ls()) # Clear the working space
# invisible(gc()) # Manually free memory
# args <- commandArgs(trailingOnly = TRUE)
# if(length(args) > 0) { setwd(args[1]); } # Load any parameters from the command line if any
# repo <- "https://mirrors.sjtug.sjtu.edu.cn/cran/" # Select the mirror of CRAN
# packages <- c("readxl", "foreign")
# # If a package does not exist, then install the package and load the package
# # Otherwise just load the package
# for (package in packages) {
# if(!require(package, quietly = TRUE, character.only = TRUE)) {
# install.packages(package, repos = repo);
# require(package, quietly = TRUE, character.only = TRUE);
# }
# }
################################################################################
### Statistics
### Summary statistics
# Table and cross tab
table(airbnb_listings$neighbourhood_group)
table(airbnb_listings$neighbourhood_group, airbnb_listings$minimum_nights)
table(airbnb_listings$wq, airbnb_listings$wwj)
# table() builds a contingency table: it counts how often each value (or each
# combination of values) occurs.
# mean
# Computes the arithmetic mean of the numeric vector passed to it
mean(airbnb_listings$price)
mean(c(airbnb_listings$price, NA)) # A single NA makes the result NA
# Spell out TRUE: the shorthand T is an ordinary variable that can be reassigned.
mean(c(airbnb_listings$price, NA), na.rm = TRUE)
# Small hand-made data frame so the results of mean() can be checked by hand
airbnb_1 <- data.frame(
  name = c("Skylit Midtown Castle", "BlissArtsSpace!"),
  host_id = c(2845, 7356),
  host_name = c("Jennifer", "Garon"),
  room_type = c("Entire home", "Private room"),
  price = c(150, 60),
  minimum_nights = c(30, 30)
)
airbnb_1
mean(airbnb_1$price) # 210 / 2 = 105
mean(c(airbnb_1$price, NA)) # NA
# Spell out TRUE: the shorthand T is an ordinary variable that can be reassigned.
mean(c(airbnb_1$price, NA), na.rm = TRUE) # 105, the NA value is ignored
# mean(x, trim = 0, na.rm = FALSE, ...)
# x is a numeric or logical vector
# trim gives a trimmed mean: a fraction between 0 and 0.5, e.g. 0.10 drops the
#   largest 10% and the smallest 10% of the data before averaging. Default 0.
# na.rm is a logical value: whether NA values are dropped before computing
# standard deviation
sd(airbnb_listings$price)
# variance
var(airbnb_listings$price)
# minimum
min(airbnb_listings$price)
# maximum
max(airbnb_listings$price)
# median
median(airbnb_listings$price)
# quantile: here the 0.9 quantile (90th percentile)
quantile(airbnb_listings$price, 0.9)
# Show summary statistics for every column
summary(airbnb_listings)
summary(airbnb_1)
################################################################################
### Functions in R
# Apply family of functions
# Compute average price of airbnb listings
# apply() needs an object with dimensions (matrix, array or data frame). A bare
# vector such as airbnb_listings$price fails with
# "dim(X) must have a positive length", so the column is selected with
# ["price"], which keeps it as a one-column data frame.
apply(airbnb_listings["price"], 2, mean) # Also try to change to max, min ...
# apply() calls the given function on every row (MARGIN = 1) or every column
# (MARGIN = 2) of the object.
airbnb_2 <- data.frame(
  host_id = c(2845, 7356),
  price = c(150, 60),
  minimum_nights = c(30, 30)
)
airbnb_2
apply(airbnb_2, 2, max)
# Compute average price and minimum_nights required of airbnb listings
lapply(list(airbnb_listings$price, airbnb_listings$minimum_nights), mean)
# lapply() is used almost exactly like sapply() below, but the leading "l"
# stands for "list": lapply() always returns a list.
sapply(list(airbnb_listings$price, airbnb_listings$minimum_nights), mean)
# apply() is convenient, but it targets matrices, N-dimensional arrays and data
# frames. For vectors and lists use sapply() (it also accepts data frames).
# The leading "s" stands for "simplify": the result is simplified when possible.
# Compute average price of airbnb listings by region
tapply(airbnb_listings$price, airbnb_listings$neighbourhood_group, mean)
# Both rows of airbnb_2 have minimum_nights == 30, so there is a single group
# whose mean price is 105. (Author's open question: why is the order reversed?)
tapply(airbnb_2$price, airbnb_2$minimum_nights, mean)
# tapply() applies a function within the groups defined by a factor (or a list
# of factors) and returns the per-group summary.
# mapply
# (Notes of 4 March end here.)
###################################################
# 5 March
# Load the provided data set again
getwd()
airbnb_listings <- read.csv("listings.csv", stringsAsFactors = FALSE)
View(airbnb_listings)
# aggregate function
# Compute average price of airbnb listings by region
# Syntax: aggregate(x, by, FUN, ..., simplify = TRUE, drop = TRUE)
# `by` is a list of grouping elements, each as long as the variables in the data frame x.
# The elements are coerced to factors before use.
list(neighbourhood_group = airbnb_listings$neighbourhood_group)
# Here neighbourhood_group of airbnb_listings is the grouping column passed to `by`, comparable to GROUP BY in SQL
aggregate(airbnb_listings$price, by = list(neighbourhood_group = airbnb_listings$neighbourhood_group), mean)
# The rows are grouped by that column and mean() is applied to price within each group, comparable to an SQL aggregate
head(aggregate(airbnb_listings$price, by = list(neighbourhood_group = airbnb_listings$neighbourhood_group), mean),n=4L)
#  neighbourhood_group        x
#1               Bronx 104.4990
#2            Brooklyn 131.5993
#3           Manhattan 205.2784
#4              Queens 109.8711
# Use the formula specification
head(aggregate(price ~ neighbourhood_group, data = airbnb_listings, mean))
# This form reads more easily
# expand.grid
# Creates a data frame with one row for every combination of the values in the vectors or factors passed as arguments
# Generate x1-x5, y1-y5 variable names
apply(expand.grid(c("x", "y"), 1:5), 1, paste, collapse="") # MARGIN = 1: paste the cells of each row together
# [1] "x1" "y1" "x2" "y2" "x3" "y3" "x4" "y4" "x5" "y5"
apply(expand.grid(c("x", "y"), 1:5), 2, paste, collapse="") # MARGIN = 2: paste the cells of each column together
#        Var1         Var2
#"xyxyxyxyxy" "1122334455"
expand.grid(c("x", "y"), 1:5)
#   Var1 Var2
#1     x    1
#2     y    1
#3     x    2
#4     y    2
#5     x    3
#6     y    3
#7     x    4
#8     y    4
#9     x    5
#10    y    5
# Sample code: generate a formula using expand.grid
# Builds a formula string; `dep` is the dependent variable and `indep` holds
# the independent variables.
dep <- "y"
tb <- expand.grid(c("x", "z"), 1:5) # The resulting data frame is shown below
#   Var1 Var2
#1     x    1
#2     z    1
#3     x    2
#4     z    2
#5     x    3
#6     z    3
#7     x    4
#8     z    4
#9     x    5
#10    z    5
# Glue the cells of every row of tb together to obtain one variable name per row.
indep <- apply(tb, 1, function(cells) paste(cells, collapse = ""))
indep
# [1] "x1" "z1" "x2" "z2" "x3" "z3" "x4" "z4" "x5" "z5"
# Right-hand side: the variable names joined by "+"; it is attached to the
# dependent variable with " ~ " in between.
formula <- paste(dep, paste(indep, collapse = "+"), sep = " ~ ")
formula
#[1] "y ~ x1+z1+x2+z2+x3+z3+x4+z4+x5+z5"
################################################################################
### Parallel computing
# First load the required packages automatically, following the sample
# preparation code shown earlier in these notes.
repo <- "https://mirrors.sjtug.sjtu.edu.cn/cran/" # Select the mirror of CRAN
packages <- c("foreach", "doParallel")
for (package in packages) {
  # require() returns FALSE when the package is missing, which triggers the install.
  if (!require(package, quietly = TRUE, character.only = TRUE)) {
    install.packages(package, repos = repo)
    # library() stops with an error if the package still cannot be loaded,
    # instead of silently returning FALSE and letting later code fail.
    library(package, character.only = TRUE, quietly = TRUE)
  }
}
# Console output from the author's session:
#package 'iterators' successfully unpacked and MD5 sums checked
#package 'foreach' successfully unpacked and MD5 sums checked
#The downloaded binary packages are in C:\Users\wenjie.wang\AppData\Local\Temp\RtmpK8ESC7\downloaded_packages
#package 'doParallel' successfully unpacked and MD5 sums checked
#The downloaded binary packages are in C:\Users\wenjie.wang\AppData\Local\Temp\RtmpK8ESC7\downloaded_packages
# Calculate the total price of a listing by multiplying price and minimum_nights
# For loop
tic <- proc.time() # Vector of the user, system and elapsed time used by this R process so far
tic
#   user  system  elapsed
#  61.95   79.45 90332.90
n_listings <- nrow(airbnb_listings) # nrow() returns the number of rows of the table
n_listings
#[1] 36724
airbnb_listings[1, "price"] * airbnb_listings[1, "minimum_nights"] #4500
head(airbnb_listings[c("price", "minimum_nights")], 1L) #150,30
# Preallocate the result instead of growing it with c() on every iteration,
# which copies the whole vector each time. The NA placeholders take on the
# type of the first value assigned.
total_price1 <- rep(NA, n_listings)
# seq_len() yields an empty sequence for an empty table, whereas 1:0 would
# iterate over c(1, 0).
for (i in seq_len(n_listings)) {
  # if(i %% 1000 == 0) { print(i); }
  cur_total <- airbnb_listings[i, "price"] * airbnb_listings[i, "minimum_nights"] # Column names must be quoted
  total_price1[i] <- cur_total
}
print(proc.time() - tic)
# Timing from the author's session (recorded with the earlier version that grew the vector):
#   user  system elapsed
#   1.39    0.22    1.63
# Foreach
# foreach offers a for-loop-like syntax that keeps the code readable and can
# also be run in parallel.
packages <- c("data.table")
for (package in packages) {
  # require() returns FALSE when the package is missing, which triggers the install.
  if (!require(package, quietly = TRUE, character.only = TRUE)) {
    install.packages(package, repos = repo)
    # library() stops with an error if the package still cannot be loaded,
    # instead of silently returning FALSE.
    library(package, character.only = TRUE, quietly = TRUE)
  }
}
# Console output from the author's session:
#package 'data.table' successfully unpacked and MD5 sums checked
#The downloaded binary packages are in C:\Users\wenjie.wang\AppData\Local\Temp\RtmpK8ESC7\downloaded_packages
tic <- proc.time()
# seq_len() yields an empty sequence for an empty table, whereas 1:0 would
# iterate over c(1, 0).
total_price2 <- foreach(i = seq_len(nrow(airbnb_listings)), .combine = 'c', .packages = c("data.table")) %do% {
  cur_total <- airbnb_listings[i, "price"] * airbnb_listings[i, "minimum_nights"] # Column names must be quoted
  cur_total
}
print(proc.time() - tic)
# Timing from the author's session:
#   user  system elapsed
#   3.64    0.09    3.75
# (Author's note: not faster at all, even a little slower than the plain for loop.)
all(total_price1 == total_price2) # Do the two results agree?
#[1] TRUE
# Parallel foreach
# Register the parallel backend
# Number of cores in the computer minus one (why? presumably to leave one core
# free for the rest of the system). The degree of parallelism should not exceed
# the number of cores. detectCores() can return NA, in which case one worker is used.
available_cores <- detectCores()
n_cores <- if (is.na(available_cores)) 1L else max(1L, available_cores - 1L)
n_cores #11 on the author's machine
# Use at most 4 workers, but never more than the machine can provide.
n_cores <- min(4L, n_cores)
# n_cores <- detectCores() * 0.8
cl <- makeCluster(n_cores)
cl # In the author's 11-worker run: socket cluster with 11 nodes on host 'localhost'
registerDoParallel(cl) # Register the cluster
# Run foreach in parallel. The `finally` clause releases the worker processes
# even when the loop stops with an error.
tryCatch({
  tic <- proc.time()
  print(tic)
  # seq_len() yields an empty sequence for an empty table, whereas 1:0 would
  # iterate over c(1, 0).
  total_price3 <- foreach(i = seq_len(nrow(airbnb_listings)), .combine = 'c', .packages = c("data.table")) %dopar% {
    cur_total <- airbnb_listings[i, "price"] * airbnb_listings[i, "minimum_nights"] # Column names must be quoted
    cur_total
  }
  print(proc.time() - tic)
}, finally = stopCluster(cl)) # Unregister the resources
# Timings from the author's sessions:
#   user  system elapsed
#   5.12    0.53    6.33
# (11 workers in parallel; author's note: why is it even slower?)
#   user  system elapsed
#   4.39    0.47    5.24
# (result of the run with 4 workers)
# Try to remove the .packages argument in the previous statements, run again, and see what will happen
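# (Stray web-page footer text "网友评论", i.e. "reader comments"; kept as a comment so that it is not evaluated as code.)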