-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProject_w11.R
94 lines (57 loc) · 2.35 KB
/
Project_w11.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Data Merging week 11 homework #
# What to do !!!! #
## Merge All files into data frame
## Save data about whether the patient has cancer or not
#Environment Setting
# Make /home/21500576 the working directory. Relative paths used later in this
# script (the "DRA_Data2.csv" output) resolve against it.
# NOTE(review): hard-coded absolute path; the script only runs where this
# directory exists.
setwd("/home/21500576")
#' Merge all expression files in a directory into one wide data frame
#'
#' Reads every file in `mypath` with `read.table(header = TRUE)`, keeps the
#' files that have exactly `expected_rows` rows and more than `id_cols`
#' columns, and column-binds them. The first kept file contributes all of its
#' columns; every later file contributes only the columns after its first
#' `id_cols` columns.
#'
#' The label of a file is the single character five positions from the end of
#' its path (for a name such as "xxx_1.txt" that is the "1"), converted to
#' integer. It is repeated once per patient column of that file.
#'
#' @param mypath Directory containing the files to merge.
#' @param expected_rows Row count a file must have to be kept.
#' @param id_cols Number of leading non-patient columns in each file.
#' @return A list with `Data` (the merged data frame, or `NULL` when no file
#'   qualifies) and `Result` (a plain integer vector holding one label per
#'   patient column, in the same order as the patient columns of `Data`).
multmerge <- function(mypath, expected_rows = 54675L, id_cols = 3L) {
  filenames <- list.files(path = mypath, full.names = TRUE)
  datalist <- lapply(
    filenames,
    function(f) read.table(file = f, header = TRUE)
  )

  # Keep only files with the expected shape.
  keep <- vapply(
    datalist,
    function(d) nrow(d) == expected_rows && ncol(d) > id_cols,
    logical(1)
  )
  datalist <- datalist[keep]
  filelist <- filenames[keep]

  # Label = the character five positions from the end of each path.
  label_pos <- nchar(filelist) - 4L
  cancer_or_not <- as.integer(substr(filelist, label_pos, label_pos))

  num_of_patients <- vapply(datalist, ncol, integer(1)) - as.integer(id_cols)

  # rep() with a `times` vector always yields a flat vector. The previous
  # mapply() + unlist() produced a matrix whenever all files had the same
  # number of patients (or only one file was kept).
  result <- rep(cancer_or_not, times = num_of_patients)

  # A logical index is used so that id_cols = 0 keeps every column.
  drop_ids <- function(d) d[seq_len(ncol(d)) > id_cols]
  data <- Reduce(function(acc, d) cbind(acc, drop_ids(d)), datalist)

  list(Data = data, Result = result)
}
# Run the merge on the shared data directory and unpack the two elements of
# the returned list for the steps below.
#datas<-multmerge("/Users/chanhee/Desktop/Data")
datas <- multmerge(mypath = "/home/data/GEO_GPL570")
df <- datas[["Data"]]
result <- datas[["Result"]]
# Data Preprocessing week 12 homework #
# What to do !!!! #
## Remove genes with no names and genes that are duplicated
## Remove column 1 and column 3, and then transpose the data
## Add result to the dataframe
## Explore the data!
## Remove NA data
## Set the data type correctly for result and gene data
## Normalize the data
# ---- Preprocessing: probes x patients -> patients x genes ----
# Uses `df` (the merged table) and `result` (one label per patient column),
# both created earlier in this script.

# Keep only rows that carry a gene symbol. The explicit NA test matters:
# indexing with an NA condition inserts an all-NA row, which after the
# transpose becomes an all-NA column and makes the NA filter below drop every
# patient.
has_symbol <- !is.na(df$Gene_Symbol) & df$Gene_Symbol != ""
p_df <- df[has_symbol, ]

# Keep the first row for each distinct value of column 2.
p_df <- p_df[!duplicated(p_df[, 2]), ]

# Drop columns 1 and 3, then transpose so that each row is one patient. Row 1
# of the transposed table holds the values of the former column 2; promote it
# to the column names and remove it from the data.
p_df <- as.data.frame(t(p_df[, c(-1, -3)]))
colnames(p_df) <- as.character(unlist(p_df[1, ]))
p_df <- p_df[-1, ]

# Attach the label as the last column.
p_df$Result <- result

# Drop patients with any missing value (expression or label).
p_df <- p_df[complete.cases(p_df), ]
p_df$Result <- as.factor(p_df$Result)

# The transpose went through a character matrix, so convert every expression
# column back to numeric, rounded to 3 decimals. `Result` is the last column
# and is left alone.
gene_cols <- seq_len(ncol(p_df) - 1L)
p_df[gene_cols] <- lapply(
  p_df[gene_cols],
  function(x) round(as.numeric(as.character(x)), digits = 3)
)

#distribution0<-apply(p_df[p_df$Result=="0",-length(p_df)],1,mean)
#distribution1<-apply(p_df[p_df$Result=="1",-length(p_df)],1,mean)
#distribution
#plot(distribution0)
#points(distribution1,col = "red")

# Standardise each patient (row) across its genes: (x - mean) / sd, rounded to
# 3 decimals. The arithmetic is done on the patients x genes matrix so every
# value stays in its own cell. The previous apply(..., 1, ...) returned a
# genes x patients matrix that was then refilled column-major into the data
# frame, scrambling the values.
# NOTE(review): the original used MARGIN = 1, so per-patient scaling is kept;
# confirm that per-gene scaling was not intended.
expr <- as.matrix(p_df[gene_cols])
row_mean <- apply(expr, 1, mean)
row_sd <- apply(expr, 1, sd)
# A length-nrow vector recycles down the columns, i.e. it lines up with rows.
scaled <- round((expr - row_mean) / row_sd, digits = 3)
p_df[gene_cols] <- as.data.frame(scaled)

#distribution0<-apply(p_df[p_df$Result=="0",-length(p_df)],1,mean)
#distribution1<-apply(p_df[p_df$Result=="1",-length(p_df)],1,mean)
#distribution
#plot(distribution0)
#points(distribution1,col = "red")

write.csv(p_df,"DRA_Data2.csv")