From 3fd24f8b3e37b72c2d4d72eeca7c7ff4dd9f164a Mon Sep 17 00:00:00 2001 From: kycw <40841907+kycw@users.noreply.github.com> Date: Thu, 21 Jan 2021 16:34:30 -0700 Subject: [PATCH] Add files via upload "Jan 21: +1 problem fixed, strand info added, unique ID for each ASO added, first 3 cols match required BED format" --- CovertingCoordinates/openASO_position.Rmd | 28 ++++++++++------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/CovertingCoordinates/openASO_position.Rmd b/CovertingCoordinates/openASO_position.Rmd index 9a09380..b46170d 100644 --- a/CovertingCoordinates/openASO_position.Rmd +++ b/CovertingCoordinates/openASO_position.Rmd @@ -46,7 +46,7 @@ genseq_byaso$transcript_id <- gsub("\\..*","",genseq_byaso$hg38.knownGene.name) # make smol -genseq_byaso <- tail(genseq_byaso,1000) +# genseq_byaso <- head(genseq_byaso,1000) ``` ### Map the transcriptomic coordinates @@ -81,22 +81,17 @@ aso_txt_pos <- na.omit(aso_txt_pos, cols = "start") # convert the transcript position information into an IRanges format. # add an ASO group number 'aso_grp' to be able to connect two genome ranges corresponding to the same ASO, but spanning exons. -# grp_list <- lapply(aso_txt_pos, function(x){lapply(x, row.names)}) -# grp_list <- lapply(aso_txt_pos, function(x){lapply(x, row.names); x}) - -# ir <- IRanges(start = aso_txt_pos$start, width = (aso_txt_pos$end - aso_txt_pos$start), names = aso_txt_pos$transcript_id, aso_grp = rownames(aso_txt_pos)) - ir <- IRanges(start = aso_txt_pos$start, width = (aso_txt_pos$end - aso_txt_pos$start), names = aso_txt_pos$transcript_id, aso_grp = list(rownames(aso_txt_pos))) # create an EnsDB object containing genomic position information. dbfile <- system.file("extdata/EnsDb.Hsapiens.v86.sqlite", package = "EnsDb.Hsapiens.v86") db <- EnsDb(dbfile) -# latest version of X.toGenome tries to replace id with a group identifier. Defaults to using 'NAMES'/transcript id which is not unique enough to join back at the very end. +# latest version of X.toGenome tries to replace id with a group identifier. Defaults to using 'NAMES' transcript id which is not unique enough to join back at the very end. -# gr <- transcriptToGenome(ir, db) +gr <- transcriptToGenome(ir, db) # gr <- transcriptToGenome(ir, db, id = mcols(ir)[,1]) -gr <- transcriptToGenome(ir, db, id = mcols(ir)$aso_grp) +# gr <- transcriptToGenome(ir, db, id = mcols(ir)$aso_grp) ``` @@ -107,9 +102,9 @@ gr <- transcriptToGenome(ir, db, id = mcols(ir)$aso_grp) # unlist the GRanges object to be able to pull transcript start/end information without direct reference to transcript. gr_unlist <- unlist(gr, recursive = TRUE, use.names = TRUE) -gnm_df <- data.frame(loci = seqnames(gr_unlist), - starts = start(gr_unlist) - 1, - ends = end(gr_unlist), +gnm_df <- data.frame(chrom = seqnames(gr_unlist), + chromStart = start(gr_unlist), + chromEnd = end(gr_unlist) + 1, strand = strand(gr_unlist), gr_unlist@ranges@width, gr_unlist@elementMetadata@listData[["tx_id"]], gr_unlist@elementMetadata@listData[["tx_start"]], gr_unlist@elementMetadata@listData[["tx_end"]]) @@ -128,13 +123,14 @@ colnames(gnm_df)[colnames(gnm_df) == ```{r GRanges to BED format, warning = FALSE} # create new data frame that connects ASO info to generated genome range info. - -final_df <- dplyr::full_join(aso_txt_pos, gnm_df, by = c('transcript_id' = 'tx_id', 'start' = 'tx_start', 'end' = 'tx_end', )) +df <- merge(aso_txt_pos, gnm_df, by.x = c('transcript_id', 'start'), by.y = c('tx_id', 'tx_start')) # add a unique identifier -final_df$uniq_id <- paste(final_df$GeneID, "_", final_df$ASOseq) +df$uniq_id <- paste(df$GeneID, "_", df$ASOseq) -# convert the data frame to bed format. +# reorder columns to match BED format. +col_order <- c("chrom", "chromStart", "chromEnd", "uniq_id", "transcript_id", "ASOseq", "aso_rc_seq", "ASOeffective","strand","start", "end", "tx_end", "width", "GeneID", "OriginalName", "HGNC.ID", "hg38.knownGene.name", "Sequence") +final_df <- df[ , col_order] # write.csv(final_df, file = "openASO_gr.bed", quote = F, sep = "\t", row.names = F, col.names = F) ```