This file provides a detailed description of how the results presented in the manuscript were generated, including but not limited to the processing of raw data, and the codes developed/employed for analysis.
| Software | |
|---|---|
| FastQC v0.11.9 | https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ |
| Trim Galore! v0.6.6 | https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/ |
| Bowtie2 v2.4.2 | http://bowtie-bio.sourceforge.net/bowtie2/index.shtml |
| HISAT2 v2.1.0 | https://ccb.jhu.edu/software/hisat2/index.shtml |
| samtools v1.9 | http://samtools.sourceforge.net/ |
| macs2 v2.2.7.1 | https://github.com/macs3-project/MACS/wiki/Install-macs2 |
| Deeptools v3.5.0 | https://deeptools.readthedocs.io/en/develop/index.html |
| Homer v4.11 | http://homer.ucsd.edu/homer/ |
| featureCounts v2.0.1 | https://subread.sourceforge.net/ |
| picard toolkit v2.26.0 | http://broadinstitute.github.io/picard/ |
| IGV v2.8.10 | http://software.broadinstitute.org/software/igv/ |
| R v4.2.3 | https://www.r-project.org/ |
The human reference genome is downloaded from Ensembl. The rRNA sequences and annotation of repeat RNAs for human genome assemblies (hg38) are downloaded from UCSC Genome Browser.
- Install some softwares
conda install -c bioconda trim-galore bowtie2 fastqc hisat2 macs2 homer subread samtools
- Build index for rRNA alignment:
bowtie2-build human_rRNA.fasta human_rRNA_index/human_rRNA
- Build index for genome alignment:
hisat2-build -p 20 Homo_sapiens.GRCh38.release94.dna.onlychr.fa hisat2_index/hg38_genome
hisat2-build -p 20 m6A_spikein.fa spikein_index/m6A_spikein
bowtie2-build --threads 20 dm6.fa dm6_genome
cd $workdir/1_quality_control && mkdir -p fastqc/logs
ls *.fastq.gz | while read id;
do
name=$(basename $id .fq.gz)
fastqc -t 20 -o ./fastqc $id &>./fastqc/logs/${name}.log
done
multiqc ./fastqc/ -f -o ./fastqc &>./fastqc/multiqc.log
cd $workdir/2_trimmed_data
for i in ${sample[@]}; do
trim_galore -q 20 -j 30 --phred33 --nextera --stringency 3 --paired --length 36 --fastqc -o . $i*"fastq.gz" >> ./trim_galore_log.txt
done
cd $workdir/3_mapping
mkdir -p logs
for i in ${sample[@]} ; do
echo $i
bowtie2 --local -X 2000 \
-p 30 -x ${hg38_index} \
-1 ../2_trimmed_data/${i}_R1_val_1.fq.gz \
-2 ../2_trimmed_data/${i}_R2_val_2.fq.gz 2> logs/${i}_hg38.txt | samtools view -@ 20 -h -S -b| samtools sort -@ 20 -o ./${i}_hg38.sorted.bam
echo ''
done
for i in *.bam; do samtools index $i & done
cd $workdir/4_final_bam
mkdir -p logs
for i in ` ls ../3_mapping/*bam `;do
name=`basename $i .bam`
picard MarkDuplicates I=$i O=${name}.dedup.bam M=${name}.dedup.log REMOVE_DUPLICATES=false CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT ASSUME_SORTED=true 2>./logs/${name}.log
done
for i in *.dedup.bam; do
samtools view -h -b -F 1548 $i | samtools sort -o ${i/.bam/.filterMAPQ.bam}
samtools index ${i/.bam/.filterMAPQ.bam}
done
for i in *filterMAPQ.bam;do
mtReads=$(samtools idxstats $i | grep 'chrM' | cut -f 3)
totalReads=$(samtools idxstats $i | awk '{SUM += $3} END {print SUM}')
echo '==> mtDNA Content:' $(bc <<< "scale=2;100*$mtReads/$totalReads")'%'
samtools view -h $i |grep -v chrM |samtools sort -O bam -@ 20 -o - > ${i/.bam/}.last.bam
samtools index -@ 20 ${i/.bam/}.last.bam
done
cd $workdir/5_reads_bigwig
mkdir -p bw_files logs computeMatrix figures
for i in ` ls ../4_final_bam/*.last.bam `; do
name=`basename $i .last.bam`
bamCoverage -b $i -o ./bw_files/${name}.bw --outFileFormat bigwig --normalizeUsing RPKM -bs 50 -p 30 2> ./logs/${name}_bamCoverage.log
done
cd $workdir/6_macs2_callpeak
for i in ` ls ../4_final_bam/*.last.bam `; do
name=`basename $i _hg38.sorted.dedup.filterMAPQ.last.bam`
macs2 callpeak -t $i -g hs --nomodel --shift 100 --extsize 200 -f BAMPE -n ${name} --outdir . >> "./${name}_callpeak.log" 2>&1
done
mkdir -p merged_peaks_by_treat
for i in ` ls *rep1*narrowPeak | sed 's/.narrowPeak//' | sort | uniq `; do
rep1=$i".narrowPeak"
rep2=${rep1/rep1/rep2}
#rep2=` echo $rep1 | sed 's/rep1/rep2/' `
name=`basename $i _rep1_fc`
bedtools intersect -a $rep1 -b $rep2 -f 0.5 -F 0.5 -e | cut -f1-4,7,6 | awk -F "\t" -v OFS="\t" '{print $1,$2,$3,$4,$6,$5}'| sort | uniq > merged_peaks_by_treat/${name}.bed
done
cd $workdir/5_reads_bigwig
computeMatrix reference-point --referencePoint center \
-b 3000 -a 3000 \
-R LOVO_merge.bed \
-S ./bw_files/LOVO*.bw \
-o ./computeMatrix/LOVO_ATACpeak_matrix.gz \
--skipZeros --missingDataAsZero \
--binSize 50 \
--smartLabels \
-p 30
for i in `ls ./computeMatrix/LOVO_ATACpeak_matrix.gz`; do
name=`basename $i matrix.gz`
plotHeatmap -m $i \
-out ./figures/${name}_heatmap.pdf \
--colorMap bwr \
--heatmapWidth 8 \
--boxAroundHeatmaps no \
--xAxisLabel ''
done
for i in `ls ./computeMatrix/LOVO_ATACpeak_matrix.gz`; do
name=`basename $i matrix.gz`
plotProfile -m $i \
-out figures/${name}.pdf \
--perGroup \
--legendLocation best \
--plotHeight 8
done
cd $workdir/2_trimmed_data
for i in ${sample[@]}; do
trim_galore -q 20 -j 20 --clip_R2 3 --phred33 --illumina --stringency 3 --length 36 --paired --fastqc -o . $i*"fastq.gz" >> ./trim_galore_MERIP.log.txt
done
cd $workdir/3_rm_rRNA
for i in ${sample[@]}; do echo $i; bowtie2 -p 20 -q --un-conc-gz "./rm_rRNA/"$i"_rmrRNA.fq.gz" -x ${human_rRNA_index}/human_rRNA -1 "$workdir/2_trimmed_data/"$i".R1_val_1.fq.gz" -2 "$workdir/2_trimmed_data/"$i".R2_val_2.fq.gz" |samtools view -bS -F 4 -|samtools sort - -o "./aligned_rRNA/"$i".bam"; done
cd $workdir/4_mapping_result && mkdir -p alignment_summary multiqc_report multiqc_report_spike
for i in ${sample[@]} ; do
hisat2 -p 30 -x $hg38_index --rna-strandness RF -k 30 -1 $workdir/3_rm_rRNA/rm_rRNA/${i}_rmrRNA.fq.1.gz -2 $workdir/3_rm_rRNA/rm_rRNA/${i}_rmrRNA.fq.2.gz | samtools view -bS |samtools sort -@ 8 -o ${i}_hg38_sorted.bam
done
for i in ${sample[@]} ; do
echo $i
hisat2 -p 20 -x $spike_index --rna-strandness RF -1 $workdir/4_mapping_result/unaligned/${i}.fq.1.gz -2 $workdir/4_mapping_result/unaligned/${i}.fq.2.gz | samtools view -bS |samtools sort -@ 5 -o ${i}_spike_sorted.bam
done
for i in *.bam; do samtools index $i; done
cd $workdir/6_macs2_callpeak
for i in ` ls ../../5_final_bam/*bam `; do
name=`basename $i .bam | sed 's/.*\///'`
samtools view -@ 20 -b -f 147 $i > $name.rev1.bam
samtools view -@ 20 -b -f 99 $i > $name.rev2.bam
samtools merge -@ 20 -f $name.rev.bam $name.rev1.bam $name.rev2.bam
samtools index -@ 20 $name.rev.bam
done
for i in ` ls ../../5_final_bam/*bam `; do
name=`basename $i .bam | sed 's/.*\///'`
samtools view -@ 20 -b -f 83 $i > $name.fwd1.bam
samtools view -@ 20 -b -f 163 $i > $name.fwd2.bam
samtools merge -@ 20 -f $name.fwd.bam $name.fwd1.bam $name.fwd2.bam
samtools index -@ 20 $name.fwd.bam
done
rm -f *rev1.bam *rev2.bam *fwd1.bam *fwd2.bam
for i in ` ls ../6_macs2_callpeak/split_reads_by_strand/Input*bam `; do
treat=`echo $i | sed s'/Input/IP/'`
macs2 callpeak -t $treat -c $i -n $name --keep-dup 5 -q 0.01 -f BAM --verbose 3 --nomodel --extsize 150 -g 1.43e8 -B --outdir ./bam 2>./bam/${name}.macs.out ; done
cd $workdir/2_trimmed_data
for i in ${sample[@]}; do
trim_galore -q 20 -j 10 --phred33 --stringency 3 --length 36 --paired --fastqc -o . $i*"fastq.gz" >> ./trim_galore_log.txt
done
cd $workdir/3_mapping_result
for i in ${sample[@]} ; do
echo $i
bowtie2 -p 10 -q --local \
-x $index \
--un-conc-gz ./unmap_reads/${i}_unmap.fastq.gz \
-1 ../2_trimmed_data/${i}_R1_val_1.fq.gz \
-2 ../2_trimmed_data/${i}_R2_val_2.fq.gz 2>./logs/${i}_mapping.log | samtools view -h -S -b| samtools sort -@ 20 -m 3G -o ./${i}.sorted.bam
done
for i in ${sample[@]} ; do
echo $i
bowtie2 --local --very-sensitive --no-mixed --no-discordant \
--phred33 -I 10 -p 10 -x ${dm6_index} \
--no-overlap --no-dovetail \
-1 ../2_trimmed_data/${i}_R1_val_1.fq.gz \
-2 ../2_trimmed_data/${i}_R2_val_2.fq.gz 2> logs/${i}_dm.txt | samtools view -h -S -b -q 20 | samtools sort -@ 10 -m 3G -o ./${i}_dm.sorted.bam
done
cd $workdir/4_final_bam
for i in `ls ../3_mapping_result/MAPQ_filtering/*bam `;do
name=`basename $i .sort.bam`
picard MarkDuplicates REMOVE_DUPLICATES=true I=$i O=${name}.dedup.bam M=${name}.dedup.log 2>./logs/${name}.log
done
for i in `ls *bam`; do samtools index $i ;done
cd $workdir/5_homer_callpeak
bam=$workdir/4_final_bam
(for i in `ls $bam/*.bam `
do
name=`basename $i .dedup.bam`
makeTagDirectory ./${name}_tag
done
(for i in ` ls -d *tag | grep 'IP' `; do
findPeaks ${i} -i ${i/IP/Input} -style histone -region -size 1000 -minDist 2500 -L 0 -region -o peak/${i/_tag/}.peak.txt
done
for i in ` ls *rep1*bed | sort | uniq `; do
rep2=${i/rep1/rep2}
bedtools intersect -a $i -b $rep2 -f 0.3 -F 0.3 -e | sort > ${i/rep1_hg38.sort/overlapped}
done
cd $workdir/6_reads_bigwig
for id in ` ls ../4_final_bam/*bam `; do
name=`basename $id .dedup.bam`
bamCoverage -b $id \
-o ./bw_files/${name}".bw" \
--outFileFormat bigwig \
--binSize 50 \
--normalizeUsing RPKM \
-p 10 2> ./logs/${name}_bamCoverage.log
done
for i in ` ls ../4_final_bam/*IP*.bam `; do
name=`basename $i .dedup.bam`
bamCompare -b1 ${i} -b2 ${i/IP/Input} \
--operation log2 \
--outFileFormat bigwig \
--binSize 50 \
--scaleFactorsMethod None \
--normalizeUsing RPKM \
-o ./bw_files/${name/IP_/}_log2ratio.bw \
-p 10 2> ./logs/${name/IP_/}_bamCompare.log
done
for i in ${sample[@]};do
annotatePeaks.pl ${i}.bed hg38 1>annotation/${i}.peakAnn.xls 2>annotation/${i}.annLog.txt
done
for i in ` ls ../4_chip_final_bam/*.dedup.bam`; do
name=`basename $i .dedup.bam`
bamCoverage --binSize 50 -b $i -o bw_files/${name}.bw --outFileFormat bigwig --normalizeUsing BPM -p 16
done
computeMatrix reference-point --referencePoint center \
-S bw_files/*LOVO*bw --smartLabels \
-R merged.LOVO.peaks.bed \
--skipZeros --missingDataAsZero -p 20 \
-a 1500 -b 1500 \
-o ./computeMatrix/LOVO_METTL3.Matrix.gz \
plotHeatmap -m ./computeMatrix/LOVO_METTL3.Matrix.gz -out ./figures/LOVO_METTL3.pdf --dpi 360 --sortRegions descend --sortUsing mean --colorMap bwr --colorNumber 100000 --whatToShow "plot, heatmap and colorbar" --startLabel "" --endLabel "" -T "LOVO OLA24h/shControl" --heatmapHeight 10
plotProfile -m ./computeMatrix/LOVO_METTL3.Matrix.gz -out ./figures/LOVO_METTL3.Profile.pdf --dpi 360 --perGroup --colors "#e30b1d" "#FC9272" "#9ECAE1" "#2280BB" "#99000D" "#ff5e00" "#084594" "#116CE1" --legendLocation best --plotHeight 8 --plotWidth 9
rawdata=read.table('./LOVO_METTL3.tab',header = T)
input_ola_1=rawdata[,126:175]
input_ctrl_1=rawdata[,726:775]
ip_ola_1=rawdata[,1326:1375]
ip_ctrl_1=rawdata[,1926:1975]
library(tidyverse)
count_df.merge <- NULL
for (i in c('input_ola_1',
'input_ctrl_1',
'ip_ola_1',
'ip_ctrl_1'))
{
data=get(i)
data['mean',]=apply(data,2,mean)
colnames(data)
data$name=rownames(data)
colnames(data)
plotdata=data.table::melt(data, id.vars=c("name"),
variable.name = "Group", value.name = "RPKM")
plotdata_name=plotdata[plotdata$name=='mean',]
plotdata_name$Region2=c(seq(1,50,1))
assign(paste0(i,'.df'),plotdata_name,)
plotdata_name$Group=gsub('\\..*','',plotdata_name$Group)
colnames(plotdata_name)[3]=paste0(i,'.bpm')
count_df.merge <- bind_cols(
count_df.merge,
select(get(paste0(i,'.df')),3)
)
}
colnames(count_df.merge)= c('input_ola_1',
'input_ctrl_1',
'ip_ola_1',
'ip_ctrl_1')
count_df.merge$ola_1=log2(count_df.merge$ip_ola_1/count_df.merge$input_ola_1)
count_df.merge$ctrl_1=log2(count_df.merge$ip_ctrl_1/count_df.merge$input_ctrl_1)
plotdata=data.table::melt(count_df.merge[,9:10],
variable.name = "Treat", value.name = "BPM")
plotdata$Group=gsub('_.*','',plotdata$Treat)
library(ggpubr)
ggplot(plotdata, aes(x=Group, y=BPM,color=Group))+
geom_boxplot(outlier.colour = NA)+
scale_color_manual(values=c("black","#DD0044EE"))+
labs(x="", y="METTL3 Coverage (BPM)")+
stat_compare_means(method='t.test')+
theme(axis.text = element_text(size = 10), plot.margin = unit(c(0.5,0.5,0,0.5), "cm"),
axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))+
theme(legend.position = '',#legend.direction = "vertical", # c(0.13, 0.80)
legend.text = element_text( size = 8))+
theme(plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm"))+
theme(axis.title.x = element_text(size = 10,margin = margin(t=15)))+
theme(axis.title.y = element_text(size = 10,margin = margin(r=11 )))
for i in `ls *shM3ola.vs.shCTRLola.gained_closed_peak_for_annotation.bed`;do
annotatePeaks.pl $i hg38 1>annotation/${i/bed/Ann.xls} 2>annotation/${i/bed/annLog.txt}
done
peak=c()
peakanno=as.data.frame(fread(paste0('LOVO_shM3ola.vs.shCTRLola.gained_closed_peak_for_annotation.Ann.xls')))
peakanno2=peakanno
peakanno2$Annotation=gsub("\\(.*).*","",peakanno2$Annotation)
peakanno2$Strand='.'
anno=gsub("\\(.*).*","",peakanno$Annotation)
number=as.data.frame(table(anno))
number$ratio=number$Freq/length(anno)
sample=strsplit(colnames(peakanno)[1],split=' ')[[1]][3]
rownames(number)=number$anno
peak=data.frame(sample=gsub('.gained_closed_peak_for_annotation.bed','',sample),#strand=sample[3],
`ThreeUTR`=number["3' UTR",'Freq'],`ThreeUTR_Ratio`=number["3' UTR",'ratio'],
`FiveUTR`=number["5' UTR",'Freq'],`FiveUTR_Ratio`=number["5' UTR",'ratio'],
Exon=number["exon",'Freq'],ExonRatio=number["exon",'ratio'],
Intergenic=number["Intergenic",'Freq'],IntergenicRatio=number["Intergenic",'ratio'],
Intron=number["intron",'Freq'],IntronRatio=number["intron",'ratio'],
Noncoding=number["non-coding",'Freq'],NoncodingRatio=number["non-coding",'ratio'],
Promoter=number["promoter-TSS",'Freq'],PromoterRatio=number["promoter-TSS",'ratio'],
TTS=number["TTS",'Freq'],TTSRatio=number["TTS",'ratio']) %>% rbind(peak, .)
peak[is.na(peak)]=0
colnames(peak)
mydata=peak[c(1,3,5,7,9,11,13,15,17)]
for (i in 2:9)
{mydata[,i]=mydata[,i]*100
}
mydata2=data.frame(sample='LOVO_shM3ola.vs.shCTRLola.gained_closed_peak',
Promoter=mydata$PromoterRatio,
Genebody=mydata$ThreeUTR_Ratio+mydata$FiveUTR_Ratio+mydata$ExonRatio,
Intron=mydata$IntronRatio,
Intergenic=mydata$IntergenicRatio,
ncRNA=mydata$NoncodingRatio,TTS=mydata$TTSRatio)
p2 <- ggradar(
mydata2,
base.size = 12,
axis.label.size = 6,
grid.line.width = 1,
values.radar = c("0%","25", "50%"),
axis.labels =paste0(colnames(mydata2)[-1], ' (', round(mydata2[1,-1],2),'%)'),#rep(NA, 8),
grid.min = 0, grid.mid = 25, grid.max = 50,
group.line.width = 1,
group.point.size = 2.5,
group.colours = c('#FF9966',"#AD002A","#DF8F44","#80796B","#f39b7f", "#00a087", "#3c5488", "#4dbbd5", "#e64b35","#925E9F"),
background.circle.colour = "#FFCCCC",
axis.line.colour ="#FFCCCC",
gridline.min.colour = "#FFCCCC",
gridline.max.colour = "#FFCCCC",
gridline.mid.colour = "#FFCCCC",
legend.position = "top",
legend.text.size = 8,
)+
theme(plot.background = element_blank(),
panel.background = element_blank())
The methods are as described in Figure 2d.
The methods are as described in Figure 2e.
library(scales)
data=read.table('annotation/pie_for_loss_intergenic_peak.txt',header = T)
rownames(data)=data$RNA.Species
data$LOVO.ratio=round(data$LOVO/sum(data$LOVO)*100,2)
data$RKO.ratio=round(data$RKO/sum(data$RKO)*100,2)
plotdata=data.table::melt(data[,c(1,4,5)],id.vars='RNA.Species',variable.name='Cells',value.name='Ratio')
ggplot(plotdata, aes(x =Cells , y =Ratio,fill=RNA.Species)) +
geom_bar(stat = "identity",position = 'stack',width = 0.7,col='white')+
coord_flip()+
geom_text(aes(label = paste0(Ratio,'%')),position=position_stack(vjust =0.5),
angle=45,size = 3,col='#333333')+
scale_fill_manual(values=c('#0277BD','#e7e6e188','#f57170'))+
labs(x="",y=expression(The~Fraction~of~Gained-closed~Intergenic~Peaks))+
theme( plot.margin = unit(c(0.5,0.5,0,0.5), "cm"),axis.ticks.length =unit(.1, "cm"),
axis.text.x = element_text(size = 10))+
theme(plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm"))+ #调整与图片边缘的距离
theme(axis.title.x = element_text(size = 12,margin = margin(t=12)))+
guides(fill=guide_legend("RNA Species"))+
theme(legend.position = 'top')+
theme(axis.title.y = element_text(size = 12,margin = margin(r=12 )))
The methods used for Figure 2l are as described in Figure 2j.
The methods used for Figure 3d,e,f,g,i are as described in Figure 2d,e,g,j.
LOVO.hypo.intersect.repeats=bedtoolsr::bt.intersect(a='repeat.class.bed', b='LOVO_chromatin_hypo_peak.bed',F=0.2,s=T,u=T,wa=T)
RKO.hypo.intersect.repeats=bedtoolsr::bt.intersect(a='repeat.class.bed', b='RKO_chromatin_hypo_peak.bed',F=0.2,s=T,u=T,wa=T)
LOVO.hypo.intersect.repeats$family=gsub('.*@','',LOVO.hypo.intersect.repeats$V4)
RKO.hypo.intersect.repeats$family=gsub('.*@','',RKO.hypo.intersect.repeats$V4)
table(LOVO.hypo.intersect.repeats$family)
table(RKO.hypo.intersect.repeats$family)
table(LOVO.hypo.intersect.repeats$V5)
table(RKO.hypo.intersect.repeats$V5)
lovo.dotplot=as.data.frame(table(LOVO.hypo.intersect.repeats$family))
lovo.dotplot$cell='LOVO'
order_regions=order(lovo.dotplot$Freq,decreasing = T)[c(1:8)]
lovo.dotplot$group='The_Rest'
for (i in order_regions)
{
lovo.dotplot[i,'group']=as.character(lovo.dotplot$Var1)[i]
}
lovo.dotplot.new=aggregate(Freq~group+cell,lovo.dotplot,sum)
lovo.dotplot.new$ratio=lovo.dotplot.new$Freq/sum(lovo.dotplot.new$Freq)
rko.dotplot=as.data.frame(table(RKO.hypo.intersect.repeats$family))
rko.dotplot$cell='RKO'
order_regions=order(rko.dotplot$Freq,decreasing = T)[c(1:8)]
rko.dotplot$group='The_Rest'
for (i in order_regions)
{
rko.dotplot[i,'group']=as.character(rko.dotplot$Var1)[i]
}
rko.dotplot.new=aggregate(Freq~group+cell,rko.dotplot,sum)
rko.dotplot.new$ratio=rko.dotplot.new$Freq/sum(rko.dotplot.new$Freq)
rownames(rko.dotplot.new)=rko.dotplot.new$group
mergedf2=cbind(lovo.dotplot.new,rko.dotplot.new[lovo.dotplot.new$group,])
mergedf2$group=as.factor(mergedf2$group)
mergedf2$cell=as.factor(mergedf2$cell)
colnames(mergedf2)=c(colnames(mergedf2)[1:4],"RKO_group" ,"RKO_cell" , "RKO_Freq" , "RKO_ratio")
mergedf2$ratio=round(mergedf2$ratio*100,2)
mergedf2$RKO_ratio=round(mergedf2$RKO_ratio*100,2)
ggplot(mergedf2,aes(x=ratio,y=RKO_ratio))+
geom_point(size=3,col=c(rep('#66CCCC',4),'#FF99CC',rep('#66CCCC',4)))+
geom_text_repel(aes(label =group),max.overlaps=10,size=1,
point.padding = unit(0.3, "lines"),nudge_y=2,
segment.color = "black",xlim=c(0,40),
show.legend = FALSE)+
theme(legend.position = "bottom",legend.box = "vertical",panel.grid.major =element_blank(),
legend.margin=margin(t= 0, unit='cm'),legend.spacing = unit(0,"in"))
lovo.L1dotplot=LOVO.hypo.intersect.repeats[LOVO.hypo.intersect.repeats$family=='L1',]
lovo.L1dotplot$subfamily=gsub('@.*','',lovo.L1dotplot$V4)
lovo.L1dotplot.stat=as.data.frame(table(lovo.L1dotplot$subfamily))
lovo.L1dotplot.stat$cell='LOVO'
order_regions=order(lovo.L1dotplot.stat$Freq,decreasing = T)[c(1:20)]
lovo.L1dotplot.stat$group='The_Rest'
for (i in order_regions)
{
lovo.L1dotplot.stat[i,'group']=as.character(lovo.L1dotplot.stat$Var1)[i]
}
lovo.L1dotplot.stat.new=aggregate(Freq~group+cell,lovo.L1dotplot.stat,sum)
lovo.L1dotplot.stat.new$ratio=lovo.L1dotplot.stat.new$Freq/sum(lovo.L1dotplot.stat.new$Freq)
rko.L1dotplot=RKO.hypo.intersect.repeats[RKO.hypo.intersect.repeats$family=='L1',]
rko.L1dotplot$subfamily=gsub('@.*','',rko.L1dotplot$V4)
rko.L1dotplot.stat=as.data.frame(table(rko.L1dotplot$subfamily))
rko.L1dotplot.stat$cell='rko'
order_regions=order(rko.L1dotplot.stat$Freq,decreasing = T)[c(1:20)]
rko.L1dotplot.stat$group='The_Rest'
for (i in order_regions)
{
rko.L1dotplot.stat[i,'group']=as.character(rko.L1dotplot.stat$Var1)[i]
}
rko.L1dotplot.stat.new=aggregate(Freq~group+cell,rko.L1dotplot.stat,sum)
rko.L1dotplot.stat.new$ratio=rko.L1dotplot.stat.new$Freq/sum(rko.L1dotplot.stat.new$Freq)
a=intersect(rko.L1dotplot.stat.new$group,lovo.L1dotplot.stat.new$group)[1:13]
mergedf3=cbind(lovo.L1dotplot.stat.new[lovo.L1dotplot.stat.new$group%in%a,],
rko.L1dotplot.stat.new[rko.L1dotplot.stat.new$group%in%a,])
mergedf3$group=as.factor(mergedf3$group)
mergedf3$cell=as.factor(mergedf3$cell)
colnames(mergedf3)=c(colnames(mergedf3)[1:4],"RKO_group" ,"RKO_cell" , "RKO_Freq" , "RKO_ratio")
mergedf3$ratio=round(mergedf3$ratio*100,2)
mergedf3$RKO_ratio=round(mergedf3$RKO_ratio*100,2)
ggplot(mergedf3,aes(x=ratio,y=RKO_ratio))+
geom_point(size=3,col=c(rep('#66CCCC',6),rep('#FF99CC',3),rep('#66CCCC',1),rep('#FF99CC',1),rep('#66CCCC',2)))+
geom_text_repel(aes(label =group),max.overlaps=10,size=1,
point.padding = unit(0.3, "lines"),nudge_y=1,
segment.color = "black",xlim=c(0,40),
show.legend = FALSE)+
xlim(c(0,10))+ylim(c(0,10))+
theme(legend.position = "bottom",legend.box = "vertical",panel.grid.major =element_blank(),legend.margin=margin(t= 0, unit='cm'),legend.spacing = unit(0,"in"))
The methods are as described in Figure 2e.
The methods are as described in Figure 2d.
library(ComplexHeatmap)
L1_plot2=L1_plot[grep('L1PA|L1PB',rownames(L1_plot),perl = T),]
mean_m6A_level_h=Heatmap(L1_plot2[,1,drop=F],show_row_dend = FALSE,
cluster_rows = TRUE,
cluster_columns = FALSE,
col= circlize::colorRamp2(c(1,2), c("white","#984EA3")),
width = unit(20, "mm"),
show_row_names = T, border=FALSE
)
mean_m6A_level_hh=draw(mean_m6A_level_h)
orders=row_order(mean_m6A_level_hh)
m6A_FoldChange_h=Heatmap(L1_plot2[,2,drop=F],show_row_dend = FALSE,
cluster_rows = TRUE,
cluster_columns = FALSE,
col= circlize::colorRamp2(c(-2, 1), c("#377EB8","white")),
width = unit(20, "mm"),
show_row_names = T, border=FALSE
)
m6A_FoldChange_hh=draw(m6A_FoldChange_h)
orders=row_order(m6A_FoldChange_hh)
mean_RNA_abundance_h=Heatmap(L1_plot2[,3,drop=F],show_row_dend = FALSE,
cluster_rows = TRUE,
cluster_columns = FALSE,
col= circlize::colorRamp2(c(7,12), c("white","#E41A1C")),
width = unit(20, "mm"),
show_row_names = T, border=FALSE
)
mean_RNA_abundance_hh=draw(mean_RNA_abundance_h)
orders=row_order(mean_RNA_abundance_hh)
range(L1_plot2[,4,drop=F])
fivenum(L1_plot2[,4])
RNA_abundance_change_h=Heatmap(L1_plot2[,4,drop=F],show_row_dend = FALSE,
cluster_rows = TRUE,
cluster_columns = FALSE,
col= circlize::colorRamp2(c(-0.1, 0, 0.1),c("navy", "white", "firebrick3")),
width = unit(20, "mm"),
show_row_names = T, border=FALSE
)
RNA_abundance_change_hh=draw(RNA_abundance_change_h)
orders=row_order(RNA_abundance_change_hh)
range(L1_plot2[,5,drop=F])
fivenum(L1_plot2[,5])
divergence_h=Heatmap(L1_plot2[,5,drop=F],show_row_dend = FALSE,
cluster_rows = TRUE,
cluster_columns = FALSE,
col= circlize::colorRamp2(c(2,20),c("white", "#A65628")),
width = unit(20, "mm"),
show_row_names = T, border=FALSE
)
divergence_hh=draw(divergence_h)
orders=row_order(divergence_hh)
range(L1_plot2[,6,drop=F])
fivenum(L1_plot2[,6])
K9_change_h=Heatmap(L1_plot2[,6,drop=F],show_row_dend = FALSE,
cluster_rows = TRUE,
cluster_columns = FALSE,
col= circlize::colorRamp2(c(0,0.5),c( "white", "#FF7F00")),
width = unit(20, "mm")
)
K9_change_hh=draw(K9_change_h)
orders=row_order(K9_change_hh)
library(RColorBrewer)
set.seed(123)
pdf('L1_heatmap_clusterBYK9.pdf',height = 10,width=10)
hlist3=K9_change_h+RNA_abundance_change_h+m6A_FoldChange_h+divergence_h+mean_m6A_level_h+mean_RNA_abundance_h
draw(hlist3, gap = unit(0.9, "cm"),heatmap_legend_side = "left")
dev.off()
bedtools getfasta -fi Homo_sapiens.GRCh38.release94.dna.onlychr.fa -bed m6a_marked_hypo_L1PA3.bed -s > m6a_marked_hypo_L1PA3.fa
bedtools getfasta -fi Homo_sapiens.GRCh38.release94.dna.onlychr.fa -bed m6a_marked_hypo_L1PA4.bed -s > m6a_marked_hypo_L1PA4.fa
bedtools getfasta -fi Homo_sapiens.GRCh38.release94.dna.onlychr.fa -bed m6a_marked_hypo_L1PA5.bed -s > m6a_marked_hypo_L1PA5.fa
bedtools getfasta -fi Homo_sapiens.GRCh38.release94.dna.onlychr.fa -bed m6a_marked_hypo_L1PA7.bed -s > m6a_marked_hypo_L1PA7.fa
for i in `ls m6a_marked_hypo_L1PA*bed|grep -v length5000.bed `;do
awk '{if($3-$2>4999) print $0}' $i > ${i/.bed/.length5000.bed}
bedtools getfasta -fi Homo_sapiens.GRCh38.release94.dna.onlychr.fa -bed ${i/.bed/.length5000.bed} -s > ${i/.bed/.length5000.fa}
done
L1PA_CONSENSUS.R
if(T)
{
msa_res=read.csv('L1PA3_CONSENSUS.CSV',header=F)
msa_res=msa_res[,-1]
seq='GGGGGAGGAGCCAAGATGGCCGAATAGGAACAGCTCCGGTCTACAGCTCCCAGCGTGAGCGACGCAGAAGAC
GGGTGATTTCTGCATTTTCCATCTGAGGTACCGGGTTCATCTCACTAGGGAGTGCCAGACAGTGGGCGCAGG
TCAGTGGGTGCAGCGCACCGTGCGCGAGCCGAAGCAGGGCGAGGCATTGCCTCACTCGGGAAGCGCAAGGGG
TCAGGGAGTTAGTTCCCTTTCCTAGTCAAAGAAAGGGGTGACAGACGGCACCTGGAAAATCGGGTCACTCCC
ACCCGACCCGAATACTGCGCTTTTCCGACGGGCTTAAAAAAACGGCGCACCAGGAGATTATATCCCGCACCT
GGCTCGGAGGGTCCTACGCCCACGGAGTCTCGCTGGATTGCTAGCACAGCAGTCTGAGATCAAACTGCAAGG
CGGCAGCGAGGCTTGGGGGAGGGGCGCCCGCCATTGCCCAGGCTTGCTTAGGTAAACAAAGCAGCCGGGAAG
CTCGAACTGGGTGGAGCCCACCACAGCTCAAGGAGGCCTGCCTGCCTCTGTAGGCTCCACCTCTGGGGGCAG
GGCACAGACAAACAAAAAGACAGCAGTAACCTCTGCAGACTTAAATGTCCCTGTCTGACAGCTTTGAAGAGA
GCAGTGGTTCTCCCAGCACGCAGCTGGAGATCTGAGAACGGGCAGACTGCCTCCTCAAGTGGGTCCCTGACC
+CTGA+CCCTGACCCCCGAGCAGCCTAACTGGGAGGCACCCCCCCAGCAGGGGCAGACTGACACCTCACACG
GCCGGGTACTCCTCTGAGACAAAACTTCCAGAGGAACGATCAGACAGCAGCATTCGCGGTTCACGAAAATCC
GCTGTTCTGCAGCCACCGCTGCTGATACCCAGGCAAACAGGGTCTGGAGTGGACCTCTAGCAAACTCCAACA
GACCTGCAGCTGAGGGTCCTGTCTGTTAGAAGGAAAACTAACAAACAGAAAGGACATCCACACCAAAAACCC
ATCTGTACATCACCATCATCAAAGACCAAAAGTAGATAAAACCACAAAGATGGGGAAAAAAACAGAGCAGAA
AAACTGGAAACTCTAAAAAAGCAGAGCGCCTCTCCTCCTCCAAAGGAACGCAGTTCCTCACCAGCAACGGAA
CAAAGCTGGACGGAGAATGACTTTGACGAGCTGAGAGAAGAAGGCTTCAGACGATCAAATTACTCCGAGCTA
C+GGAGGAAATTCAAACCAAAGGCAAAGAAGTTGAAAACTTTGAAAAAAATTTAGAAGAATGTATAACTAGA
ATAACCAATACAGAGAAGTGCTTAAAGGAGCTGATGGAGCTGAAAACCAAGGCTCGAGAACTACGTGAAGAA
TGCAGAAGCCTCAGGAGCCGATGCGATCAACTGGAAGAAAGGGTATCAGCGATGGAAGATGAAATGAATGAA
ATGAAGCGAGAAGGGAAGTTTAGAGAAAAAAGAATAAAAAGAAACGAGCAAAGCCTCCAAGAAATATGGGAC
TATGTGAAAAGACCAAATCTACGTCTGATTGGTGTACCTGAAAGTGAAAGTGACGGGGAGAATGGAACCAAG
TTGGAAAACACTCTGCAGGATATTATCCAGGAGAACTTCCCCAATCTAGCAAGGCAGGCCAACATTCAGATT
CAGGAAATAACAGAGAACGCCACAAAGATACTCCTCGAGAAGAGCAACTCCAAGACACATAATTGTCAGATT
CACCACACATAATTGTCAGATTCACCAAAGTTGAAATGAAGGAAAAAATGTTAAGGGCAGCCAGAGAGAAAG
GTCGGGTTACCCTCAAAGGGAAGCCCATCAGACTTAACAGCGGATCTCTCGGCAGAAACTCTACAAGCCAGA
AGAGAGTGGGGGCCAATATTCAACATTCTTAAAGAAAAGAATTTTCAACCCAGAATTTCATATCCAGCCAAA
CTAAGCTTCATAAGTGAAGGAGAAATAAAATACTTTACAGACAAGCAAATGCTGAGAGATTTTGTCACCACC
AGGCCTGCCCTAAAAGAGCTCCTGAAGGAAGCACTAAACATGGAAAGGAACAACCGGTACCAGCCGCTTGCA
AAATCATGCCAAAATGTAAAGACCATCGAGACTAGGAAGAAACTGCATCAACTAACGAGCAAAATAACCAGC
TAACATCATAATGACAGGATCAAATTCACACATAACAATATTAACTTTAAAATGTAAATGGACTAAATGCTC
CAATTAAAAGACACAGACTGGCAAATTGGATAAAAGAGTCAAGACCCATCAGTGTGCTGTATTCAGGAAACC
CATCTCACGTGCAGAGACACACATAGGCTCAAAATAAAAGGATGGAGGAAGATCTACCAAGCAAATGGAAAA
CAAAAAAAAAAAGGCAGGGGTTGCAATCCTAGTCTCTGATAAAACAGACTTTTAAACCAACAAAGATCAAAA
GAGACAAAGAAGGCCATTACATAATGGTAAAGGGATCAATTCAACAAGAAGAGCTAACTATCCTAAATATAT
ATATGCACCCCAATACAGGAGCACCCAGATTCATAAAGCAAGAAGTCCTGAGTGACCTACAAAGAGACTTTA
GACTCCCACACATTAATAATGGGAGACTTTAACACCCCACTGTCAACATTAGACAGATCAACGAGACAGAAA
GTCAACAAGGATACCCAGGAATTGAACTCAGCTCTGCACCAAGCGGACCTAATAGACATCTACAGAACTCTC
CACCCCAAATCAACAGAATATACATTTTTTTTCAGCACCACACCACACCTATTCCAAAATTGACCACATACT
TGGAAGTAAAGCTCTCCTCAGCAAATGTAAAAGAACAGAAATTATAACAAACTATCTCTCAGACCACAGTGC
AATCAAACTAGAACTCAGGATTAAGAATCTCACTCAAAACCGCTCAACTACATGGAAACTGAACAACCTGCT
CCTGAATGACTACTGGGTACATAACGAAATGAAGGCAGAAATAAAGATGTTCTTTGAAACCAACGAGAACAA
AGACACAACATACCAGAATCTCTGGGACACATTCAAAGCAGTGTGTAGAGGGAAATTTATAGCACTAAATGC
CCACAAGAGAAAGCAGGAAAGATCCAAAATTGACACCCTAACATCACAATTAAAAGAACTAGAAAAGCAAGA
GCAAACACATTCAAAAGCTAGCAGAAGGCAAAGAAATAACTAAAATCAGAGCAGAACTGAAGGAAATAGAGA
CACAAAAAAACCCTTCCAAAAAATTAATGAATCCAGGAGCTGGTTGTTTTTGAAAGGATCAACAAAATTGAT
AGACCGCTAGCAAGACTAATAAAGAAAAAAAGAGAGAAGAATCAAATAGACGCAATAAAAAATGATAAAGGG
GATATCACCACCGATCCCACAGAAATACAACACTACCATCAGAGAATACTACAAACACCTCTACGCAAATAA
ACTAGAAAATCTAGAAGAAATGGATAAATTCCTCGACACATACACTCTCCCAAGACTAAACCAGGAAGAAGT
TGAATCTCTGAATAGACCAATAACAGGATCTGAAATTGTGGCAATAATCAATAGCTTACCAACCAAAAAGAG
TCCAGGACCAGATGGATTCACAGCCGAATTCTACCAGAGGTACAAGGAGGAACTGGGTACCATTCCTTCTGA
AACTATTCCAATCAATAGAAAAAGAGGGAATCCTCCCTAACTCATTTTATGAGGCCAGCATCATCCTGATAC
CAAAGCCGGGCAGAGACACAACCAAAAAAGAGAATTTTAGACCAATATCCTTCCTTGATGAACATTGATGCA
AAAATCCTCAATAAAATACTGGCAAACCGAATCCAGCAGCACATCAAAAAGCTTATCCACCATGATCAAGTG
GGCTTCATCCCTGGGATGCAAGGCTGGTTCAATATACGCAAATCAATAAATGTAATCCAGCATATAAACAGA
ACCAAAGACAAAAACCACCATGATTATCTCAATAGATGCAGAAAAGGCCTTTGACAAAATTCAACAACCCTT
CATGCTAAAAACTCTCAATAAATTAGGTATTGATGGGACGTATCTCAAAATAATAAGAGCTATCTATGACAA
ACCCACAGCCAATATCATACTGAATGGGCAAAAACTGGAAGCATTCCCTTTGAAAACTGGCACAAGACAGGG
ATGCCCTCTCTCACCACTCCTATTCAACATAGTGTTGGAAGTTCTGGCCAGGGCAATTAGGCAGGAGAAGGA
AATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCAAATTGTCCCTGTTTGCAGATGACATGATTGTATATCT
AGAAAACCCCATTGTCTCAGCCCAAAATCTCCTTAAGCTGATAAGCAACTTCAGCAAAGTCTCAGGATAAAC
AAAATCAATGTACAAAAATCACAAGCATTCTTATACACCAACAACAGACAAACAGAGAGCCAAATCATGAGT
GAACTCCCATTCACAATTGCTTCAAAGAGAATAAAATACCTAGGAATCCAACTTACAAGGGATGTGAAGGAC
CTCTTCAAGGAGAACTACAAACCACTGCTCAATGAAATAAAAGAGGATACAAACAAATGGAAGAACATTCCA
TGCTCATGGGTAGGAAGAATCAATATCGTGAAAATGGCCATACTGCCCAAGGTAATTTATAGATTCAATGCC
ATCCCCATCAAGCTACCAATGACTTTCTTCACAGAATTGGAGAAAAACTACTTTAAAGTTCATATGGAACCA
AAAAAGAGCCCGCATCGCCAAGTCAATCCTAAGCCAAAAGAACAAAGCTGGAGGCATCACACTACCTGACTT
CAAACTATATACTACAAGGCTACAGTAACCAAAACAGCATGGTACTGGTACCAAAACAGAGATAATAGATCA
ATGGAACAGAACAGAGCCCTCAGAAATAACGCCGCTCATATCTACAACTATCTGATCTTTGACAAACCTGAG
AAAAACAAGCAATGGGGGAAAGGATTCCCTATTTAATAAATGGTGCTGGGAAAACTGGCTAGCCATATGTAG
AAAGCTGAAACTGGATCCCTTCCTTACACCTTATACAAAAATCAATTCAAGATGGATTAAAGACTTAAACGT
TAGACCTAAAACCATTAAAAACCCTAGAAGAAAACCTAGGCATTACCATTCAGGACATAGGCATGGGCAAGG
ACTTCATGTCTAAAACACCAAAAGCAATGGCAACAAAAGCCAAAATTGACAAATGGGATCTAATTAAACTAA
AGAGCTTCTGCACAGCAAAAGAAACTACCATCAGAGTGAACAGGCAACCTACAAAATGGGAGAAAATTTTCG
CAACCTACTCATCTGACAAAGGGCTAATATCCAGAATCTACAATGAACTCAAAACAAATTTACAAGAAAAAA
ACAAACAACCCCATCAAAAAGTGGGC+AAGGACATGAACAGACACTTCTCAAAAGAAGACATTTATGCAGCC
AAAAAAACACATGAAAAAATGCTCACCATCACTGGCCATCAGAGAAATGCAAATCAAAACCACAATGAGATA
CCATCTCACACCAGTTAGAATGGCAATCATTAAAAAGTCAGGAAACAACAGGTGCTGGAGAGGATGTGGAGA
AATAGGAACACTTTTACACTGTTGGTGGGACTGTAAACTAGTTCAACCATTGTGGAAGTCAGTGTGGCGATT
CCTCAGGGATCTAGAACTAGACTAGAAATACCATTTGACCCAGCCATCCCATTACTGGGTATATACCCAAAG
GACTATAAATCATGCTGCTATAAAGACACATGCACACGTATGTTTATTGCGGCACTATTCACAATAGCAAAG
ACTTGGAACCAACCCAAATGTCCAACAATGATAAGACTGGATTAAGAAAATGTGGCACATATACACCATGGA
ATACTATGCAGCCATAAAAAATGATGACCTTTGTAGGGACATTCGTTCATGTCCTTTGTAGGGACATGGATG
AAATTGGAAATCATCATTCTCAGTAAACTATCGCAAGAACAAAAAACCAAACACCGCATATTCTCACTCATA
GGTGGGAATTGAACAATGAGAACACATGGACACAGGAAGGGGAACAATCACACTCTGGGGACTGTTGTGGGG
TGGGGGGAGGGGGGAGGGGGGAGGGATAGCATTGGGAGATATACCTAATGCTAGATGACGAGTTTAGTGGGT
GCAGCGCACCAGCATGGCACATGTATACATACATATGTAACTAACCTGCACATTGTGCACATGTACCCTAAA
ACTT+AAAGTATAATAATAATAAAAAAAAAA'
seq=strsplit(seq,split='')
seq_filter=as.data.frame(seq[[1]])
seq_filter=as.data.frame(seq_filter[seq_filter[1]!='\n',])
msa_res_t=as.data.frame(t(msa_res))
msa_res_t=msa_res_t[-nrow(msa_res_t),]
df=cbind(msa_res_t,seq_filter)
library(dplyr)
library(tidyr)
df_raw <- df %>% separate(V2, c("g1","g2","g3","g4"), "[;]")
df_raw$G=0
df_raw$A=0
df_raw$T=0
df_raw$C=0
library(stringr)
for (i in 1:nrow(df_raw))
{
#gsub(".* (.*)%",'\\1',df_raw[1:5,7],perl = T)
df_raw[i,'G']=ifelse(str_detect(df_raw[i,'g1'],"G"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g1'],perl = T)),
ifelse(str_detect(df_raw[i,'g2'],"G"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g2'],perl = T)),
ifelse(str_detect(df_raw[i,'g3'],"G"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g3'],perl = T)),
ifelse(str_detect(df_raw[i,'g4'],"G"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g4'],perl = T)),0))))
df_raw[i,'A']=ifelse(str_detect(df_raw[i,'g1'],"A"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g1'],perl = T)),
ifelse(str_detect(df_raw[i,'g2'],"A"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g2'],perl = T)),
ifelse(str_detect(df_raw[i,'g3'],"A"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g3'],perl = T)),
ifelse(str_detect(df_raw[i,'g4'],"A"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g4'],perl = T)),0))))
df_raw[i,'T']=ifelse(str_detect(df_raw[i,'g1'],"T"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g1'],perl = T)),
ifelse(str_detect(df_raw[i,'g2'],"T"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g2'],perl = T)),
ifelse(str_detect(df_raw[i,'g3'],"T"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g3'],perl = T)),
ifelse(str_detect(df_raw[i,'g4'],"T"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g4'],perl = T)),0))))
df_raw[i,'C']=ifelse(str_detect(df_raw[i,'g1'],"C"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g1'],perl = T)),
ifelse(str_detect(df_raw[i,'g2'],"C"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g2'],perl = T)),
ifelse(str_detect(df_raw[i,'g3'],"C"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g3'],perl = T)),
ifelse(str_detect(df_raw[i,'g4'],"C"),as.numeric(gsub(".* (.*)%",'\\1',df_raw[i,'g4'],perl = T)),0))))
}
df_raw_new=df_raw
df_raw_new[is.na(df_raw_new)] <- 0
max.col(df_raw_new[1:5,7:10])
for (i in 1:nrow(df_raw))
{
index=max.col(df_raw_new[i,7:10])+6
df_raw_new[i,'base']=colnames(df_raw_new)[index]
df_raw_new[i,'percentage']=max(df_raw_new[i,7:10])
}
df_raw_final=df_raw_new[df_raw_new$percentage>10,]
df_raw_final_seq=paste(df_raw_final$base,collapse = '')
df_raw_final_seq
write.table(df_raw_final_seq,file='L1PA3_consensus.seq',quote=F,row.names = F,col.names = F)
}
bedtools intersect -b LOVO_chromatin_hypo_peak.bed \
-a L1_filter_chr.bed -s -F 0.3 >LOVO_chromatin_hypo_peak_in_L1.bed
grep L1PA3 LOVO_chromatin_hypo_peak_in_L1.bed >LOVO_chromatin_hypo_peak_in_L1PA3.bed
grep L1PA4 LOVO_chromatin_hypo_peak_in_L1.bed >LOVO_chromatin_hypo_peak_in_L1PA4.bed
grep L1PA5 LOVO_chromatin_hypo_peak_in_L1.bed >LOVO_chromatin_hypo_peak_in_L1PA5.bed
grep L1PA7 LOVO_chromatin_hypo_peak_in_L1.bed >LOVO_chromatin_hypo_peak_in_L1PA7.bed
bedtools getfasta -fi Homo_sapiens.GRCh38.release94.dna.onlychr.fa -bed LOVO_chromatin_hypo_peak_in_L1PA3.bed -s > LOVO_chromatin_hypo_peak_in_L1PA3.fa
bedtools getfasta -fi Homo_sapiens.GRCh38.release94.dna.onlychr.fa -bed LOVO_chromatin_hypo_peak_in_L1PA4.bed -s > LOVO_chromatin_hypo_peak_in_L1PA4.fa
bedtools getfasta -fi Homo_sapiens.GRCh38.release94.dna.onlychr.fa -bed LOVO_chromatin_hypo_peak_in_L1PA5.bed -s > LOVO_chromatin_hypo_peak_in_L1PA5.fa
bedtools getfasta -fi Homo_sapiens.GRCh38.release94.dna.onlychr.fa -bed LOVO_chromatin_hypo_peak_in_L1PA7.bed -s > LOVO_chromatin_hypo_peak_in_L1PA7.fa
for i in ` ls LOVO_chromatin_hypo_peak_in_L1*bed `; do
name=`basename $i .bed`
findMotifsGenome.pl $i hg38 ${name}_nobg -rna -p 20 -len 5,6,7
done
test.query_sequences.fasta
>h-sg1-L1PA4-f
GGAGGAAGTTCGAACCCATGGCAAAGAAGT
>h-sg1-L1PA7-f
CAGGAGCACCCAGATTCATAAAGCAAGTTC
>h-sg2-L1PA7-f
CAGTCTCTCAGACCACAGTGCAATCAAATT
>h-sg1-L1PA3-f
CATCTCACGTGCAGAGACACACATAGGCTC
echo "=== Sequence Information ==="
grep -c "^>" test.query_sequences.fasta
awk '/^>/ {if (seq) print length(seq); seq=""; next} {seq = seq $0} END {if (seq) print length(seq)}' test.query_sequences.fasta | sort | uniq -c
samtools faidx Homo_sapiens.GRCh38.release94.dna.onlychr.fa
bwa index Homo_sapiens.GRCh38.release94.dna.onlychr.fa
bwa mem -a -v 1 Homo_sapiens.GRCh38.release94.dna.onlychr.fa test.query_sequences.fasta > sequences_aligned.sam 2> bwa.log
samtools view -bS sequences_aligned.sam | samtools sort -o sequences_aligned.bam
bedtools bamtobed -i sequences_aligned.bam > sequences_matches.bed
wc -l sequences_matches.bed
cut -f4 sequences_matches.bed | sort | uniq -c | sort -rn
bedtools intersect -a sequences_matches.bed -b L1_name_subfamily.bed -wa -wb > sequences_in_L1.bed
bedtools intersect -a sequences_matches.bed -b L1_name_subfamily.bed -v > sequences_notin_L1.bed
wc -l sequences_in_L1.bed
total_matches=$(wc -l < sequences_matches.bed)
in_l1_matches=$(wc -l < sequences_in_L1.bed)
echo "Overall: $in_l1_matches/$total_matches = $(echo "scale=2; $in_l1_matches*100/$total_matches" | bc)%"
awk 'BEGIN{print "Sequence\tTotal\tIn_L1\tPercentage"}
NR==FNR{total[$4]++}
NR!=FNR{inL1[$4]++}
END{for(seq in total){printf "%s\t%d\t%d\t%.1f%%\n", seq, total[seq], inL1[seq]+0, (inL1[seq]+0)*100/total[seq]}}' \
sequences_matches.bed sequences_in_L1.bed | sort -k4,4nr