我的 R 语言备忘录

这里简单记录了我在 R 中常用的一些基础语法。

Select by conditions

1
2
# select * from df where gender=='male' AND age==20
# which() keeps only the rows where the condition is TRUE. A bare logical
# index would also return an all-NA row for every row whose gender or age
# is NA, which is not what the SQL above does.
df[which(df[, "gender"] == "male" & df[, "age"] == 20), ]

rename

1
# Rename the element/column called "beta" to "two"; all other names are kept.
names(d) <- replace(names(d), names(d) == "beta", "two")

select column by var

1
# Pull out one column by its name given as a string.
df[, "age"]

select rows by var(subset)

1
# Keep the rows whose fid_6 column equals 1. The column is looked up by a
# string, so "fid_6" can be swapped for a variable holding the column name.
# [[ ]] matches the name exactly, whereas `$` silently partial-matches.
subset(df, df[["fid_6"]] == 1)

remove col

1
# Drop the Species column from iris.
# A logical mask is used instead of -which(...): when no column matches,
# -which() yields integer(0) and would select zero columns instead of all.
# drop = FALSE keeps the result a data frame even if one column remains.
X <- iris[, colnames(iris) != "Species", drop = FALSE]

basic

1
2
3
4
# Basic inspection helpers. Each needs the object to inspect as its argument;
# calling them with no argument is an error.
str(df)         # compact display of the object's structure
class(df)       # class attribute, e.g. "data.frame"
mode(df)        # storage mode, e.g. "list" for a data frame
attributes(df)  # all attributes (names, class, row.names, ...)

kmeans

1
2
3
4
5
6
7
8
# Partition x into 4 clusters (x is assumed to be a numeric matrix or data
# frame defined earlier - TODO confirm).
cl <- kmeans(x, 4)
par(mar = c(10, 8, 8, 4) + 0.1)
# Cluster sizes, within-cluster and between-cluster sums of squares.
cl$size
cl$withinss
cl$betweenss
# plot() has to come first: points() and legend() only add to an existing
# plot, so calling them earlier fails or draws on a stale figure.
plot(x, col = cl$cluster)
# One colour per centre, using the same palette indices as the cluster
# colours above (1:2 would recycle over four centres).
points(cl$centers, col = seq_len(nrow(cl$centers)), pch = 8, cex = 0.5)
# Legend keyed by cluster number so that it matches the plotted colours.
legend(
  "topright",
  inset = .05,
  title = "Cluster",
  legend = seq_len(nrow(cl$centers)),
  fill = seq_len(nrow(cl$centers)),
  horiz = TRUE
)

Load data

1
# Read a comma-separated file with a header row; the userid column supplies
# the row names of the resulting data frame.
mydata <- read.table(
  "/Users/marvin/Desktop/tmp/etl2_wow_1w.csv",
  header = TRUE,
  sep = ",",
  row.names = "userid"
)

Add Col

1
# Combine x with the cluster assignment of each row into a new data frame.
ne <- data.frame(x, cl$cluster)

Remove Col

1
# Copy df without the listed columns; names that are not present are ignored.
df2 <- df[
  !is.element(
    names(df),
    c("X2", "work_duration", "idle_duration", "night_duration")
  )
]

Draw multi bar

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Mean of every column per cluster; the grouping column comes back as Group.1
# and is used for the bar labels below.
mms <- aggregate(df_aggr, by = list(df_aggr$cluster), FUN = "mean")
# Named palette. "dodgerblue" is the valid R colour name; the previous
# "dodgeblue" is not in colors() and would fail as soon as it was used.
colors <- c(
  "darkolivegreen", "darkorchid", "deeppink", "gold",
  "dodgerblue", "cyan", "aliceblue"
)
mms
str(mms)
help(barplot)
# Horizontal bar chart of the mean qq_age of each cluster.
barplot(
  mms$qq_age,
  main = "hi",
  names.arg = mms$Group.1,
  horiz = TRUE,
  cex.names = 1,
  width = 0.2,
  col = c(
    "#fffeee", "#33eeee", "#ffb90f", "#bb33ee",
    "#ff5050", "#66ee22", "#6699ee"
  )
)
#' Draw one horizontal bar chart of per-cluster means for each requested column.
#'
#' @param df Data frame with a `cluster` column plus the columns to plot.
#' @param clusters Unused; kept so that existing calls keep working.
#' @param columns Column names to plot, one chart per entry.
#' @param colors Bar fill colours, recycled by barplot() if there are more
#'   clusters than colours. Defaults to the palette the function always used.
#' @return The per-cluster means computed by aggregate(), invisibly.
draw_multi_means <- function(df, clusters, columns,
                             colors = c(
                               "#fffeee", "#33eeee", "#ffb90f", "#bb33ee",
                               "#ff5050", "#66ee22", "#6699ee"
                             )) {
  if (is.null(df$cluster)) {
    stop("`df` must contain a `cluster` column", call. = FALSE)
  }
  # Mean of every column per cluster; the group label ends up in Group.1.
  mms <- aggregate(df, by = list(df$cluster), FUN = "mean")
  for (i_col in columns) {
    barplot(
      # Index with [, i_col]: i_col holds the column name, so mms$i_col
      # would look for a column literally called "i_col".
      mms[, i_col],
      main = i_col,
      names.arg = mms$Group.1,
      horiz = TRUE,
      cex.names = 1,
      width = 0.2,
      col = colors
    )
  }
  invisible(mms)
}

Sample

1
2
# sample data frame: draw up to 1000 random rows without replacement.
# min() caps the sample size at nrow(df); asking for more rows than exist
# without replacement makes sample() stop with an error.
df[sample.int(nrow(df), min(1000, nrow(df))), ]

Subset

1
# Keep the rows whose id is one of the listed values.
df[is.element(df$id, c(1, 2)), ]

COOL BY VAR!!!

1
# Rows of d whose column x equals "c", with the column given as a string.
# which() drops rows where x is NA; a bare logical index would return an
# all-NA row for each of them.
d[which(d[, "x"] == "c"), ]

Remove rows with NA (na.omit)

1
# Returns df without the rows that contain an NA. The result is not assigned
# here, so df itself is left unchanged.
na.omit(df)

draw

1
2
3
4
5
6
7
8
9
10
11
12
# Histogram of friends_count with bars shaded from green to red by bin count.
# Inside aes() the column is given by its bare name: aes() already looks it
# up in `data`, and writing my_data$... there triggers a ggplot2 warning and
# breaks faceting.
# NOTE(review): ..count.. is deprecated in newer ggplot2 releases in favour of
# after_stat(count); switch once the installed version is confirmed.
ggplot(data = my_data, aes(x = friends_count)) +
  geom_histogram(col = "red", aes(fill = ..count..)) +
  scale_fill_gradient("Count", low = "green", high = "red")
# --- draw
# Scatter plot coloured by cluster, then the cluster centres on top.
plot(x, col = cl$cluster)
# One colour per centre, using the same palette indices as the cluster
# colours above (1:2 would only be right for exactly two clusters).
points(cl$centers, col = seq_len(nrow(cl$centers)), pch = 8, cex = 2)
# ---
# after kmeans: within-cluster and between-cluster sums of squares
cl$withinss
cl$betweenss

multi distribution

1
2
3
# Recode flag as a labelled factor; flag values other than 0, 2 and 3
# become NA.
df$cat <- factor(df$flag, levels = c(0, 2, 3), labels = c("df", "pos", "neg"))
attributes(df)
# One density curve of qq_age per level of cat, with an automatic legend.
# `groups` is spelled out in full rather than relying on partial argument
# matching (presumably lattice::densityplot - confirm lattice is loaded).
densityplot(~qq_age, data = df, groups = cat, auto.key = TRUE)