-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgdc_parser.R
More file actions
executable file
·349 lines (317 loc) · 16.4 KB
/
gdc_parser.R
File metadata and controls
executable file
·349 lines (317 loc) · 16.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
library(RCurl)
library(jsonlite)
library(XML)
#' Build the URL of a GDC API endpoint.
#'
#' @param endpoint Endpoint name appended at the end of the URL (e.g. "cases").
#' @param legacy If TRUE, the "legacy/" path segment is inserted before the
#'   endpoint.
#' @param version Optional version string; when not NULL it is inserted as
#'   "<version>/" directly after the API root.
#' @return The endpoint URL as a character string.
gdc_end <- function(endpoint, legacy = FALSE, version = NULL) {
    version_part <- if (is.null(version)) "" else paste0("", version, "/")
    legacy_part <- if (legacy) "legacy/" else ""
    paste0("https://api.gdc.cancer.gov/", version_part, legacy_part, endpoint)
}
#' Serialize a filter specification to JSON and percent-encode it for a URL.
#'
#' @param filters Nested list describing the filter; it is handed to
#'   jsonlite::toJSON unchanged.
#' @param auto_unbox Forwarded to jsonlite::toJSON.
#' @param pretty Forwarded to jsonlite::toJSON. This argument used to be
#'   accepted but ignored (the serializer was always called with
#'   pretty = FALSE); the default keeps that compact output.
#' @return The percent-encoded JSON as returned by RCurl::curlEscape.
gdc_filt <- function(filters, auto_unbox = TRUE, pretty = FALSE) {
    ## Namespace-qualified calls avoid re-attaching packages on every call.
    json <- jsonlite::toJSON(filters, auto_unbox = auto_unbox, pretty = pretty)
    RCurl::curlEscape(json)
}
#' Assemble a GDC API request URL.
#'
#' The URL is built as <endpoint>[/<cust_ins>][?<query>], where the query is
#' made of the optional `filters`, `extras`, `size` and `fields` parts.
#'
#' @param end Endpoint name passed to gdc_end().
#' @param legacy Passed to gdc_end().
#' @param filters Optional filter list; encoded with gdc_filt() and sent as
#'   the `filters=` query parameter.
#' @param cust_ins Optional identifier(s) appended to the path. Several
#'   values are joined with ",". As in the original control flow, the path id
#'   is used for the "data"/"manifest" endpoints, or for any endpoint when no
#'   `filters` are given.
#' @param extras Optional raw query text appended right after the filters
#'   (e.g. "&pretty=true"). Several values are concatenated. The caller is
#'   responsible for the leading "&" of each piece.
#' @param size Optional value for the `size=` query parameter.
#' @param fields Optional value for the `fields=` query parameter; several
#'   values are joined with ",".
#' @return The request URL as a character string.
gdc_url <- function(end = "cases", legacy = FALSE, filters = NULL, cust_ins = NULL, extras = NULL, size = NULL, fields = NULL) {
    if (length(cust_ins) > 1) {
        cust_ins <- paste0(cust_ins, collapse = ",")
    }
    ## The original referenced an undefined variable `extra` here, which made
    ## every call with more than one `extras` element fail.
    if (length(extras) > 1) {
        extras <- paste0(extras, collapse = "")
    }
    if (length(fields) > 1) {
        fields <- paste0(fields, collapse = ",")
    }
    has_extras <- length(extras) == 1 && nzchar(extras)

    url <- gdc_end(endpoint = end, legacy = legacy)

    ## The path id has to come before the "?" that starts the query string.
    use_path_id <- length(cust_ins) == 1 &&
        (end %in% c("data", "manifest") || is.null(filters))
    if (use_path_id) {
        url <- paste0(url, "/", cust_ins)
    }

    query <- ""
    if (!is.null(filters)) {
        query <- paste0("filters=", gdc_filt(filters))
    }
    if (has_extras) {
        query <- paste0(query, extras)
    }
    if (!is.null(size)) {
        query <- sprintf("%s&size=%s", query, size)
    }
    if (!is.null(fields)) {
        query <- sprintf("%s&fields=%s", query, fields)
    }
    ## Drop the separator when the first query part was added with a leading "&".
    query <- sub("^&", "", query)
    if (nzchar(query)) {
        url <- paste0(url, "?", query)
    }
    url
}
## fields_example = "files.experimental_strategy,files.file_name,files.file_id,files.center.short_name,files.data_type,files.data_category,files.metadata_files.file_name"
## Comma-separated field list used as the `fields` argument when querying
## cases (see grab_by_cases_submitter_id).
fields_example <- paste(
    c(
        "disease_type",
        "primary_site",
        "files.experimental_strategy",
        "files.file_name",
        "files.file_id",
        "files.center.short_name",
        "files.data_type",
        "files.data_category",
        "files.metadata_files.data_format",
        "files.metadata_files.data_type",
        "files.metadata_files.data_category",
        "files.metadata_files.file_name",
        "files.metadata_files.file_id",
        "files.metadata_files.submitter_id"
    ),
    collapse = ","
)
#' Query the GDC API, or download a file/manifest with the `curl` binary.
#'
#' @param url Request URL (typically built with gdc_url()).
#' @param fields Optional field list added as a `fields=` query parameter in
#'   query mode. Several values are joined with ",". "" means none.
#' @param download If FALSE (default) the URL is fetched with RCurl::getURL
#'   and the response is parsed with jsonlite::fromJSON. If TRUE the URL is
#'   downloaded by shelling out to `curl`.
#' @param download_dir Optional directory to download into. The working
#'   directory is switched for the duration of the download and restored
#'   afterwards.
#' @param size Optional value added as a `size=` query parameter in query
#'   mode.
#' @return Query mode: the parsed JSON response. Download mode: the exit
#'   status returned by system(); NULL (invisibly, with a warning) when the
#'   URL does not look like a data or manifest URL.
gdccurl <- function(url, fields = "", download = FALSE, download_dir = NULL, size = NULL) {
    if (!download) {
        ## Append a query parameter using "?" or "&" as the URL requires.
        add_param <- function(u, key, value) {
            sep <- if (!grepl("?", u, fixed = TRUE)) {
                "?"
            } else if (grepl("[?&]$", u)) {
                ""
            } else {
                "&"
            }
            paste0(u, sep, key, "=", value)
        }
        query_url <- url
        if (length(fields) > 1) {
            fields <- paste0(fields, collapse = ",")
        }
        ## `fields` and `size` used to be assembled into an unused shell
        ## command string, so they never reached the server.
        if (length(fields) == 1 && nzchar(fields)) {
            query_url <- add_param(query_url, "fields", fields)
        }
        if (!is.null(size)) {
            query_url <- add_param(query_url, "size", size)
        }
        json_input <- RCurl::getURL(query_url)
        return(jsonlite::fromJSON(json_input))
    }

    if (!grepl("\\/data\\/|manifest", url)) {
        warning("download requested but URL is neither a data nor a manifest URL: ", url, call. = FALSE)
        return(invisible(NULL))
    }
    if (!is.null(download_dir)) {
        ## on.exit restores the working directory even if the download fails;
        ## the original restore call sat after return() and never ran.
        orig_wd <- setwd(download_dir)
        on.exit(setwd(orig_wd), add = TRUE)
    }
    ## shQuote protects the shell from quotes or metacharacters in the URL.
    system(paste0("curl --remote-name --remote-header-name ", shQuote(url)))
}
#' Flatten one parsed analysis XML (as a nested list) into a one-row table.
#'
#' The row starts from the attributes of ANALYSIS_SET$ANALYSIS and is extended
#' with a text label (title + description), the reference assembly, the list
#' of processing pipelines, and substrings of the "TCGA-..." identifier found
#' in the label.
#'
#' @param xmllist Nested list with an ANALYSIS_SET$ANALYSIS element, e.g. the
#'   result of xmlToList() on an analysis XML file.
#' @return A data.table; the pipelines are stored in the list column
#'   `lst_of_pipelines` (access with dt$lst_of_pipelines[[1]]).
analysis_grab <- function(xmllist) {
    aset <- xmllist$ANALYSIS_SET$ANALYSIS
    dt <- as.data.table(t(aset$.attrs))

    label <- paste(aset$TITLE, aset$DESCRIPTION)
    assembly <- unlist(aset$ANALYSIS_TYPE$REFERENCE_ALIGNMENT$ASSEMBLY)
    dt[, label := label]
    dt[, assembly := assembly]

    pipelines <- aset$ANALYSIS_TYPE$REFERENCE_ALIGNMENT$PROCESSING$PIPELINE
    set(dt, i = 1L, j = "lst_of_pipelines", value = list(list(pipelines)))

    ## Identifier: text from the first "TCGA-" up to the first ":".
    tcga_start <- regexpr("TCGA\\-", label)
    from_tcga <- substring(label, tcga_start)
    clean_ident <- unlist(lapply(strsplit(from_tcga, "\\:"), `[`, 1))

    ## Fixed-width slices of the identifier.
    legacy_sample_id <- substring(clean_ident, 1, 28)
    sample <- substring(legacy_sample_id, 1, 25)
    tss_id <- substring(legacy_sample_id, 6, 7)
    sample_type <- substring(legacy_sample_id, 14, 15)
    analyte_code <- substring(legacy_sample_id, 20, 20)

    ## TRUE when any text anywhere in the XML mentions low pass sequencing.
    all_text <- tolower(unlist(xmllist))
    low_passes <- any(grepl("(low pass)|(lowpass)", all_text))

    dt[, legacy_sample_id := legacy_sample_id]
    dt[, sample := sample]
    dt[, low_passes := low_passes]
    dt[, tss_id := tss_id]
    dt[, sample_type := sample_type]
    dt[, analyte_code := analyte_code]
    return(dt)
}
#' Parse a set of analysis XML files into one-row tables.
#'
#' @param these_xml Character vector of XML file paths.
#' @param mc.cores,mc.preschedule Passed to parallel::mclapply.
#' @return A list with one element per input path: the data.table produced by
#'   analysis_grab() with `analysis_file_name` and `analysis_local_path`
#'   added, or a "try-error" object when that file failed. A warning naming
#'   the file is raised for each failure.
grab_xml <- function(these_xml, mc.cores = 1, mc.preschedule = FALSE) {
    parse_one <- function(this_xml) {
        this <- try({
            tmp <- XML::xmlToList(this_xml)
            message("read in: ", this_xml, "\n")
            dt <- analysis_grab(tmp)
            dt[, analysis_file_name := basename(this_xml)]
            dt[, analysis_local_path := this_xml]
            dt
        })
        ## inherits() is the reliable test for a failed try(); comparing the
        ## result with "try-error" via == does not work on a data.table.
        if (inherits(this, "try-error")) {
            warning(this_xml, " has produced an error", call. = FALSE)
        }
        this
    }
    ## Qualified call: the file never attaches `parallel` at top level.
    parallel::mclapply(these_xml, parse_one, mc.cores = mc.cores, mc.preschedule = mc.preschedule)
}
#' Look up file, case and analysis-metadata records for GDC file ids.
#'
#' Each id is queried separately against the legacy `files` endpoint: once
#' with the default fields, once for the case ids, and once for the attached
#' metadata files. The metadata record with data_type "Analysis Metadata" is
#' kept. NOTE(review): the code assumes the three responses line up row by
#' row and that the first hit carries the metadata files — confirm against
#' the API.
#'
#' @param file_ids Vector of file ids; each element is queried on its own.
#' @param mc.cores,mc.preschedule Passed to parallel::mclapply.
#' @param size Value of the `size` query parameter for every request.
#' @return A list with one element per id: a data.table on success, or the
#'   string "try-error" (with a warning carrying the error message) on
#'   failure.
grab_by_file_id <- function(file_ids, mc.cores = 1, mc.preschedule = TRUE, size = 1)
{
    case_fields <- "cases.case_id,cases.submitter_id"
    meta_fields <- "metadata_files.file_id,metadata_files.file_name,metadata_files.data_format,metadata_files.data_category,metadata_files.data_type"

    fetch_one <- function(ix) {
        tryCatch(
        {
            filters <- list(op = "in", content = list(field = "files.file_id", value = ix))
            ## fields/size are placed in the URL itself so the request
            ## carries them regardless of how gdccurl() treats its arguments.
            file_hits <- gdccurl(gdc_url("files", legacy = TRUE, filters = filters, size = size))
            dt <- as.data.table(file_hits$data$hits)

            case_hits <- gdccurl(gdc_url("files", legacy = TRUE, filters = filters, size = size, fields = case_fields))
            dt2 <- as.data.table(case_hits$data$hits$cases)
            setnames(dt2, paste0("cases.", names(dt2)))
            build_q <- cbind(dt, dt2)

            meta_hits <- gdccurl(gdc_url("files", legacy = TRUE, filters = filters, size = size, fields = meta_fields))
            analysis_file <- as.data.table(meta_hits$data$hits$metadata_files[[1]])[data_type == "Analysis Metadata"]
            setnames(analysis_file, paste0("analysis_", names(analysis_file)))
            analysis_file[, analysis_id := sub("\\_analysis.xml", "", analysis_file_name)]

            build_q <- cbind(build_q, analysis_file)
            build_q[, id := NULL]
            build_q
        }, error = function(e) {
            ## Keep the historical "try-error" marker but say what failed.
            warning("grab_by_file_id failed for ", paste(ix, collapse = ","), ": ", conditionMessage(e), call. = FALSE)
            "try-error"
        })
    }
    parallel::mclapply(file_ids, fetch_one, mc.cores = mc.cores, mc.preschedule = mc.preschedule)
}
## make_chunks = function(vec, num_per_chunk = 100) {
## len_to = ceiling(length(vec)/num_per_chunk)
## ids = split(1:length(vec), rep(1:len_to, length.out = length(vec)))
## return(lapply(ids, function(i) vec[i]))
## }
#' Chunked lookup of file, case/sample and analysis-metadata records by file id.
#'
#' The ids are split into chunks of at most `max_per_chunk` and each chunk is
#' resolved with three requests against the legacy `files` endpoint; the
#' per-chunk tables are joined on `file_id` and stacked.
#' NOTE(review): the code assumes one case per file, so that the stacked
#' cases table lines up with the hits — confirm against the API.
#'
#' @param file_ids Vector of file ids.
#' @param max_per_chunk Maximum number of ids per request.
#' @return A data.table with one block of rows per chunk, bound with
#'   rbindlist().
grab_by_file_id2 <- function(file_ids, max_per_chunk = 100) { ## more than 100 crashes
    ## Split `vec` into contiguous, balanced chunks of at most
    ## `num_per_chunk` elements. The original body referenced `case_id`,
    ## which does not exist in this function, so every call failed.
    make_chunks <- function(vec, num_per_chunk = 100) {
        if (length(vec) == 0) {
            return(list())
        }
        n_chunks <- max(1, ceiling(length(vec) / num_per_chunk))
        ind <- parallel::splitIndices(length(vec), n_chunks)
        lapply(ind, function(i) vec[i])
    }

    fields <- "files.file_id,file_id,cases.case_id,cases.submitter_id,cases.samples.sample_id,cases.files.submitter_id,metadata_files.file_id,metadata_files.file_name,metadata_files.data_format,metadata_files.data_category,metadata_files.data_type"
    meta_fields <- "metadata_files.file_id,metadata_files.file_name,metadata_files.data_format,metadata_files.data_category,metadata_files.data_type"

    chunked_ids <- make_chunks(file_ids, max_per_chunk)
    ret_lst <- lapply(chunked_ids, function(ids) {
        size <- length(ids)
        filters <- list(op = "in", content = list(field = "files.file_id", value = ids))

        ## 1) default file fields
        url <- gdc_url(end = "files", legacy = TRUE, filters = filters, size = size, fields = NULL)
        file_meta1 <- gdccurl(url = url)
        dt <- as.data.table(file_meta1$data$hits)
        dt$file_id <- file_meta1$data$hits$id

        ## 2) case and sample ids
        url <- gdc_url(end = "files", legacy = TRUE, filters = filters, size = size, fields = fields)
        file_meta2 <- gdccurl(url = url)
        dt2 <- as.data.table(rbindlist(file_meta2$data$hits$cases))
        s_df <- dt2[, samples]
        s_df <- as.data.table(do.call("rbind", s_df))
        dt2[, samples := NULL]
        dt2 <- cbind(dt2, s_df)
        ## file_id is set before AND after the renaming on purpose: the
        ## result keeps both `cases.file_id` and the join key `file_id`.
        dt2$file_id <- file_meta2$data$hits$id
        setnames(dt2, paste0("cases.", names(dt2)))
        dt2$file_id <- file_meta2$data$hits$id
        build_q <- setDT(dplyr::left_join(dt, dt2, by = "file_id"))

        ## 3) metadata files attached to the same file names
        file_nm <- build_q$file_name
        filters2 <- list(op = "in", content = list(field = "files.file_name", value = file_nm))
        url2 <- gdc_url(end = "files", legacy = TRUE, filters = filters2, size = size, fields = meta_fields)
        file_meta3 <- gdccurl(url2)
        tagged <- mapply(
            function(fl, id) {
                if (!is.null(fl)) fl$orig_file_id <- id
                fl
            },
            file_meta3$data$hits$metadata_files, file_meta3$data$hits$id,
            SIMPLIFY = FALSE
        )
        only_analysis <- lapply(tagged, function(df) df[df[["data_type"]] == "Analysis Metadata", ])
        analysis_file <- as.data.table(rbindlist(only_analysis))
        ## Prefix every column except the last one (orig_file_id, the join key).
        nms <- names(analysis_file)
        setnames(analysis_file, c(paste0("analysis_", head(nms, -1)), tail(nms, 1)))
        analysis_file[, analysis_id := sub("\\_analysis.xml", "", analysis_file_name)]

        build_q <- setDT(dplyr::left_join(build_q, analysis_file, by = c("file_id" = "orig_file_id")))
        build_q[, id := NULL]
        build_q
    })
    return(rbindlist(ret_lst))
}
#' Chunked lookup of case records by case submitter id.
#'
#' @param case_id Vector of case submitter ids.
#' @param legacy Passed to gdc_url().
#' @param max_per_chunk Maximum number of ids per request.
#' @return A data.table stacking the hits of every chunk; columns that are
#'   lists (class "list", "List" or "AsIs") are dropped.
grab_by_cases_submitter_id <- function(case_id, legacy = FALSE, max_per_chunk = 100) { ## more than 100 crashes
    ## Split `vec` into contiguous, balanced chunks of at most
    ## `num_per_chunk` elements. Uses only base R and `parallel`; the
    ## previous version ignored its arguments and needed S4Vectors.
    make_chunks <- function(vec, num_per_chunk = 100) {
        if (length(vec) == 0) {
            return(list())
        }
        n_chunks <- max(1, ceiling(length(vec) / num_per_chunk))
        ind <- parallel::splitIndices(length(vec), n_chunks)
        lapply(ind, function(i) vec[i])
    }

    chunked_ids <- make_chunks(case_id, max_per_chunk)
    ret_lst <- lapply(chunked_ids, function(ids) {
        size <- length(ids)
        filters <- list(op = "in", content = list(field = "cases.submitter_id", value = ids))
        url <- gdc_url(end = "cases", legacy = legacy, filters = filters, size = size, fields = NULL)
        file_meta1 <- gdccurl(url = url, fields = fields_example)
        dt <- as.data.table(file_meta1$data$hits)
        ## vapply always yields a logical vector, also for a table without
        ## columns, where sapply would return an empty list.
        dt[, vapply(dt, function(x) !inherits(x, c("list", "List", "AsIs")), logical(1)), with = FALSE]
    })
    return(rbindlist(ret_lst))
}
#' Look up file, case and analysis-metadata records for GDC file names.
#'
#' Each name is first resolved against the legacy `files` endpoint to get
#' its metadata files and file id; the file id is then used to fetch the
#' default file fields and the case ids.
#' NOTE(review): the code assumes the first hit carries the metadata files
#' and that the responses line up row by row — confirm against the API.
#'
#' @param file_names Vector of file names; each element is queried on its own.
#' @param mc.cores,mc.preschedule Passed to parallel::mclapply.
#' @param size Value of the `size` query parameter for every request.
#' @return A list with one element per name: a data.table on success, or the
#'   string "try-error" (with a warning carrying the error message) on
#'   failure.
grab_by_file_name <- function(file_names, mc.cores = 1, mc.preschedule = TRUE, size = 1)
{
    case_fields <- "cases.case_id,cases.submitter_id"
    meta_fields <- "metadata_files.file_id,metadata_files.file_name,metadata_files.data_format,metadata_files.data_category,metadata_files.data_type"

    fetch_one <- function(ix) {
        tryCatch(
        {
            ## fields/size are placed in the URL itself so the request
            ## carries them regardless of how gdccurl() treats its arguments.
            filters2 <- list(op = "in", content = list(field = "files.file_name", value = ix))
            meta_hits <- gdccurl(gdc_url("files", legacy = TRUE, filters = filters2, size = size, fields = meta_fields))
            analysis_file <- as.data.table(meta_hits$data$hits$metadata_files[[1]])[data_type == "Analysis Metadata"]
            setnames(analysis_file, paste0("analysis_", names(analysis_file)))
            analysis_file[, analysis_id := sub("\\_analysis.xml", "", analysis_file_name)]

            file_id <- meta_hits$data$hits$id
            filters <- list(op = "in", content = list(field = "files.file_id", value = file_id))
            file_hits <- gdccurl(gdc_url("files", legacy = TRUE, filters = filters, size = size))
            dt <- as.data.table(file_hits$data$hits)

            case_hits <- gdccurl(gdc_url("files", legacy = TRUE, filters = filters, size = size, fields = case_fields))
            dt2 <- as.data.table(case_hits$data$hits$cases)
            setnames(dt2, paste0("cases.", names(dt2)))

            build_q <- cbind(dt, dt2)
            build_q <- cbind(build_q, analysis_file)
            build_q[, id := NULL]
            build_q
        }, error = function(e) {
            ## Keep the historical "try-error" marker but say what failed.
            warning("grab_by_file_name failed for ", paste(ix, collapse = ","), ": ", conditionMessage(e), call. = FALSE)
            "try-error"
        })
    }
    parallel::mclapply(file_names, fetch_one, mc.cores = mc.cores, mc.preschedule = mc.preschedule)
}
library(httr)
library(jsonlite)
#' Map file UUIDs to aliquot ids and aliquot submitter ids (barcodes).
#'
#' Sends one request to the `files` endpoint (legacy or current) filtered on
#' `files.file_id`, expanding cases down to the aliquot level.
#' NOTE(review): `rrrepeated` and `setRownames` are not defined in this file;
#' they are presumably provided by a utility package attached by the caller.
#'
#' @param uuid Character vector of file UUIDs (lower-cased before use).
#' @param legacy If TRUE query the legacy endpoint, otherwise the current one.
#' @return A data.table with columns `file_id`, `aliquot_id` and
#'   `legacy_sample_barcode`.
getBarcode <- function(uuid, legacy = TRUE){
    uuid <- tolower(uuid)
    baseURL <- ifelse(legacy, "https://api.gdc.cancer.gov/legacy/files/?", "https://api.gdc.cancer.gov/files/?")

    ## The filter is JSON with the quoted, comma-separated ids spliced into
    ## the "value" array; only the fixed JSON pieces are URL-encoded.
    filter_head <- URLencode('{"op":"and","content":[{"op":"in","content":{"field":"files.file_id","value":[')
    quoted_ids <- paste0('"', paste(uuid, collapse = '","'))
    filter_tail <- URLencode('"]}}]}')

    query_parts <- c(
        "pretty=true",
        "expand=cases.samples.portions.analytes.aliquots",
        paste0("size=", length(uuid)),
        paste0("filters=", filter_head, quoted_ids, filter_tail),
        "fields=cases.samples.portions.analytes.aliquots.submitter_id",
        "format=JSON"
    )
    url <- paste0(baseURL, paste(query_parts, collapse = "&"))

    ## Let jsonlite fetch the URL itself; on failure, fetch via httr and
    ## parse the response text instead.
    fetch_with_httr <- function(e) {
        fromJSON(content(GET(url), as = "text", encoding = "UTF-8"), simplifyDataFrame = TRUE)
    }
    json <- tryCatch(fromJSON(url, simplifyDataFrame = TRUE), error = fetch_with_httr)

    flattened <- rrrepeated(json$data$hits$cases, 8, function(x) unlist(x, recursive = FALSE))
    stacked <- do.call("rbind", flattened)
    dt <- as.data.table(setRownames(stacked, nm = NULL))
    dt <- dt[, list(file_id = json$data$hits$id, aliquot_id, legacy_sample_barcode = submitter_id)]
    return(dt)
}
#' Chunked wrapper around getBarcode().
#'
#' @param file_ids Vector of file UUIDs.
#' @param max_per_chunk Maximum number of ids sent per request.
#' @param legacy Passed to getBarcode().
#' @return The per-chunk results of getBarcode() bound with rbindlist(); an
#'   empty table when `file_ids` is empty.
grab_barcode_by_file_id <- function(file_ids, max_per_chunk = 100, legacy = TRUE) { ## more than 100 crashes
    ## Distribute `vec` round-robin over ceiling(n / num_per_chunk) chunks
    ## (same assignment as before). Empty input yields no chunks instead of
    ## failing inside split(), which 1:length(vec) used to cause.
    make_chunks <- function(vec, num_per_chunk = 100) {
        if (length(vec) == 0) {
            return(list())
        }
        len_to <- ceiling(length(vec) / num_per_chunk)
        split(vec, rep(seq_len(len_to), length.out = length(vec)))
    }
    chunked_ids <- make_chunks(file_ids, max_per_chunk)
    lst <- lapply(chunked_ids, function(ids) {
        getBarcode(ids, legacy = legacy)
    })
    return(rbindlist(lst))
}