-
Notifications
You must be signed in to change notification settings - Fork 0
/
Compile Data For Galaxy.R
228 lines (188 loc) · 9.26 KB
/
Compile Data For Galaxy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
## These functions compile the TraceFinder data into the correct form to submit to Galaxy for NA correction.
read_TF_reports <- function(TF_FileList, TempMatrix){
  ## Collect the Peak.Area column of every TraceFinder report into one matrix.
  ##
  ## Arguments:
  ##   TF_FileList - vector of reports; each element is handed to
  ##                 report_read_check(), report_empty_check() and
  ##                 report_column_check() (presumably file paths - confirm
  ##                 against those helpers).
  ##   TempMatrix  - table whose row count fixes the number of rows of the
  ##                 result. It is only used for its size; inside the loop the
  ##                 name is reused for the report currently being read.
  ##
  ## Returns:
  ##   A list with one element, PeakAreas: a matrix with one row per row of
  ##   TempMatrix and one column per entry of TF_FileList. An empty
  ##   TF_FileList yields a matrix with zero columns.

  ## Create a matrix to hold peak area values from each report.
  PeakAreas <- matrix(NA, nrow = dim(TempMatrix)[1], ncol = length(TF_FileList))

  ## Loop through all of the reports and pull out the peak areas from each
  ## report into the PeakAreas matrix. seq_along() is used so that an empty
  ## file list performs no iterations (1:length(x) would iterate over 1, 0).
  for (i in seq_along(TF_FileList)) {
    TempMatrix <- report_read_check(TF_FileList[i])
    report_empty_check(TempMatrix, TF_FileList[i])
    ## For an ordinary multi-row, multi-column table this selects every row.
    ## NOTE(review): kept exactly as written because its effect on degenerate
    ## tables (zero rows, a single column) may be relied on downstream - confirm.
    TempMatrix <- TempMatrix[1:(dim(TempMatrix)[1]),]
    TempMatrix <- report_column_check(TempMatrix, TF_FileList[i])
    ## Every report is expected to have the same number of rows as PeakAreas.
    PeakAreas[, i] <- matrix(TempMatrix$Peak.Area)
  }

  list(PeakAreas = PeakAreas)
}
##################################
## Put Chemical Formulas from TraceFinder into a Form that Galaxy Likes
##################################
correct_chemical_formulas <- function(PeakAreas, TempMatrix){
  ## Build the compound table used for Galaxy: normalised chemical formulas
  ## (an explicit "1" after every element that has no count) plus the atom
  ## count of each tracked element.
  ##
  ## Arguments:
  ##   PeakAreas  - matrix whose row names are the compound names; only its
  ##                row count and row names are used.
  ##   TempMatrix - table with the columns Formula and Conf.Peak.Index, one
  ##                row per row of PeakAreas.
  ##
  ## Returns:
  ##   A data frame with one row per compound and 9 columns: CompoundName,
  ##   Formula, Peak_Index (numeric) and C_/H_/N_/O_/P_/S_isomers. An isomer
  ##   column holds the atom count as a string when the element occurs exactly
  ##   once in the formula, and the literal string "NA" otherwise (element
  ##   absent or listed more than once). A formula that is missing or contains
  ##   no element symbol is left unchanged and gets "NA" in every isomer
  ##   column.

  ## Elements that get an isomer column, in output column order (columns 4-9).
  isomer_elements <- c("C", "H", "N", "O", "P", "S")
  n_compounds <- dim(PeakAreas)[1]

  ## Create a table with the same number of rows as the data (PeakAreas) and
  ## 9 columns, then name the columns.
  CompoundNamesAndFormulasForStripping <- as.data.frame(
    matrix("0", nrow = n_compounds, ncol = 3 + length(isomer_elements)),
    stringsAsFactors = FALSE
  )
  colnames(CompoundNamesAndFormulasForStripping) <- c(
    "CompoundName",
    "Formula",
    "Peak_Index",
    paste0(isomer_elements, "_isomers")
  )

  ## Copy the compound names, formulas and peak indices from the inputs.
  CompoundNamesAndFormulasForStripping$CompoundName <- as.character(rownames(PeakAreas))
  CompoundNamesAndFormulasForStripping$Formula <- as.character(TempMatrix$Formula)
  CompoundNamesAndFormulasForStripping$Peak_Index <- as.numeric(as.character(TempMatrix$Conf.Peak.Index))

  ## Results are gathered in plain vectors/matrices and written back to the
  ## data frame once at the end. Isomer counts default to the string "NA".
  formulas <- CompoundNamesAndFormulasForStripping$Formula
  isomer_counts <- matrix("NA", nrow = n_compounds, ncol = length(isomer_elements))

  ## seq_len() keeps the loop from running when there are no compounds.
  for (i in seq_len(n_compounds)) {
    ## Formula of the current row, ex. C5H8O4
    form <- formulas[i]

    ## Start position of every element symbol (an upper case letter followed
    ## by any lower case letters). ex. C5H8O4 generates 1 3 5
    starts <- if (is.na(form)) {
      -1L
    } else {
      as.integer(gregexpr("[[:upper:]][[:lower:]]*", form)[[1]])
    }
    ## Missing formula or no element symbol found: nothing to rewrite.
    if (starts[1] < 1L) {
      next
    }

    ## Each element token runs up to the character before the next symbol.
    ## ex. C5 H8 O4
    ends <- c(starts[-1] - 1L, nchar(form))
    tokens <- substring(form, starts, ends)

    ## Split each token into its element (ex. C H O) and its atom count
    ## (ex. 5 8 4); an element without a count stands for a single atom.
    elements <- gsub("[[:digit:]]", "", tokens)
    counts <- gsub("[[:alpha:]]", "", tokens)
    counts[counts == ""] <- "1"

    ## Rebuild the formula with every count written out. ex. CH8O4 -> C1H8O4
    formulas[i] <- paste0(elements, counts, collapse = "")

    ## Record the atom count of each tracked element that appears exactly once.
    for (k in seq_along(isomer_elements)) {
      hit <- which(elements == isomer_elements[k])
      if (length(hit) == 1L) {
        isomer_counts[i, k] <- counts[hit]
      }
    }
  }

  CompoundNamesAndFormulasForStripping$Formula <- formulas
  for (k in seq_along(isomer_elements)) {
    CompoundNamesAndFormulasForStripping[[3L + k]] <- isomer_counts[, k]
  }

  CompoundNamesAndFormulasForStripping
}
###################################
## Put the peak areas and corrected chemical formulas into the final form to submit to Galaxy
##################################
build_final_matrix <- function(Labelling, CompoundNamesAndFormulasForStripping, SampleNames, PeakAreas, Isotopologue_Database){
  ## Build the long-format character matrix submitted to Galaxy: one row per
  ## (sample, compound) pair, ordered sample by sample.
  ##
  ## Arguments:
  ##   Labelling  - "C13" or "C13N15"; any other value is an error.
  ##   CompoundNamesAndFormulasForStripping
  ##              - table whose columns 1, 2 and 3 hold the compound name, the
  ##                formula and the peak index.
  ##   SampleNames - sample name for each column of PeakAreas.
  ##   PeakAreas  - peak areas, one row per compound and one column per sample.
  ##   Isotopologue_Database
  ##              - data frame with the columns Unique_ID, C_Isotopologue and
  ##                (for "C13N15") N_Isotopologue. Unique_ID is matched against
  ##                "<compound name> <peak index>".
  ##
  ## Returns:
  ##   A character matrix with the columns Compound, Mol_Formula, C_isomers,
  ##   [N_isomers,] SamplID and Intensity. Rows with a peak area of 0 are kept
  ##   so that no data is lost. Zero compounds or zero samples give a matrix
  ##   with zero rows.
  ##
  ## Errors:
  ##   Stops when Labelling is not supported, or when a compound has no entry
  ##   or more than one entry in Isotopologue_Database.

  if (length(Labelling) != 1L || !(Labelling %in% c("C13", "C13N15"))) {
    stop("Labelling must be \"C13\" or \"C13N15\", got: ",
         paste(Labelling, collapse = ", "), call. = FALSE)
  }
  with_nitrogen <- Labelling == "C13N15"

  ## "C13" gives 5 columns; "C13N15" adds the nitrogen isotopologue column.
  column_names <- if (with_nitrogen) {
    c("Compound", "Mol_Formula", "C_isomers", "N_isomers", "SamplID", "Intensity")
  } else {
    c("Compound", "Mol_Formula", "C_isomers", "SamplID", "Intensity")
  }

  n_compounds <- dim(CompoundNamesAndFormulasForStripping)[1]
  n_samples <- dim(PeakAreas)[2]

  ## Allocate the whole result once instead of growing it with rbind().
  ForStripping <- matrix("0",
                         nrow = n_compounds * n_samples,
                         ncol = length(column_names),
                         dimnames = list(NULL, column_names))
  if (nrow(ForStripping) == 0L) {
    return(ForStripping)
  }

  compound_rows <- seq_len(n_compounds)
  compound_names <- CompoundNamesAndFormulasForStripping[, 1]
  formulas <- CompoundNamesAndFormulasForStripping[, 2]

  ## The isotopologue lookup depends only on the compound, so it is done once
  ## here rather than once per sample.
  keys <- paste(compound_names, CompoundNamesAndFormulasForStripping[, 3], sep = " ")
  database_ids <- as.character(Isotopologue_Database$Unique_ID)
  database_row <- match(keys, database_ids)
  if (anyNA(database_row)) {
    stop("No Isotopologue_Database entry for Unique_ID: ",
         paste(unique(keys[is.na(database_row)]), collapse = ", "),
         call. = FALSE)
  }
  ambiguous <- unique(keys[keys %in% database_ids[duplicated(database_ids)]])
  if (length(ambiguous) > 0L) {
    stop("More than one Isotopologue_Database entry for Unique_ID: ",
         paste(ambiguous, collapse = ", "), call. = FALSE)
  }
  c_isotopologue <- Isotopologue_Database[database_row, "C_Isotopologue"]
  if (with_nitrogen) {
    n_isotopologue <- Isotopologue_Database[database_row, "N_Isotopologue"]
  }

  ## Fill one block of rows per sample (column of PeakAreas).
  for (i in seq_len(n_samples)) {
    rows <- (i - 1L) * n_compounds + compound_rows
    ForStripping[rows, "Compound"] <- compound_names
    ForStripping[rows, "Mol_Formula"] <- formulas
    ForStripping[rows, "C_isomers"] <- c_isotopologue
    if (with_nitrogen) {
      ForStripping[rows, "N_isomers"] <- n_isotopologue
    }
    ForStripping[rows, "SamplID"] <- SampleNames[i]
    ForStripping[rows, "Intensity"] <- PeakAreas[compound_rows, i]
  }

  ForStripping
}