Skip to content

Commit

Permalink
modified LINE1_db related scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
friend1ws committed Apr 9, 2024
1 parent ee48861 commit cec73ed
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 8 deletions.
56 changes: 50 additions & 6 deletions LINE1_db/prep_line.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,45 @@ then
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz
fi

if [ ! -f hg38-chm13v2.over.chain.gz ]
then
wget https://hgdownload.gi.ucsc.edu/hubs/GCA/009/914/755/GCA_009914755.4/liftOver/hg38-chm13v2.over.chain.gz
fi

if [ ! -f hg19ToHg38.over.chain.gz ]
then
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
fi

if [ ! -f hg19-chm13v2.over.chain.gz ]
then
wget https://hgdownload.gi.ucsc.edu/hubs/GCA/009/914/755/GCA_009914755.4/liftOver/hg19-chm13v2.over.chain.gz
fi

##########

##########
# prepare repeat masker file
if [ ! -f rmsk.txt.gz ]
then
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/rmsk.txt.gz
gzip -f chm13v2.0_RepeatMasker_4.1.2p1.2022Apr14.bed
fi

if [ ! -f chm13v2.0_RepeatMasker_4.1.2p1.2022Apr14.bed.gz ]
then
aws s3 cp s3://human-pangenomics/T2T/CHM13/assemblies/annotation/chm13v2.0_RepeatMasker_4.1.2p1.2022Apr14.bed ./
fi

python proc_rmsk.py rmsk.txt.gz > rmsk.line1.hg38.bed

liftOver rmsk.line1.hg38.bed hg38ToHg19.over.chain.gz rmsk.line1.hg19.bed.tmp rmsk.line1.unmapped

python mod_label.py rmsk.line1.hg19.bed.tmp > rmsk.line1.hg19.bed


python proc_rmsk_chm13.py chm13v2.0_RepeatMasker_4.1.2p1.2022Apr14.bed.gz > rmsk.line1.chm13v2.0.bed

#########


Expand All @@ -44,9 +65,12 @@ bcftools filter -i 'INFO/SVLEN > 5800 && INFO/SVTYPE == "LINE1"' ALL.wgs.mergedS

python proc_1000genomes.py 1000genomes.line1.hg19.vcf > 1000genomes.line1.hg19.bed

liftOver 1000genomes.line1.hg19.bed hg19ToHg38.over.chain.gz 1000genomes.line1.hg38.bed.tmp 1000genomes.line1.unmapped
liftOver 1000genomes.line1.hg19.bed hg19ToHg38.over.chain.gz 1000genomes.line1.hg38.bed.tmp 1000genomes.line1.hg38.unmapped
liftOver 1000genomes.line1.hg19.bed hg19-chm13v2.over.chain.gz 1000genomes.line1.chm13v2.0.bed.tmp 1000genomes.line1.chm13v2.0.unmapped

python mod_label.py 1000genomes.line1.hg38.bed.tmp > 1000genomes.line1.hg38.bed
python mod_label.py 1000genomes.line1.chm13v2.0.bed.tmp > 1000genomes.line1.chm13v2.0.bed

##########

##########
Expand All @@ -67,9 +91,12 @@ bcftools filter -i 'ALT == "<INS:ME:LINE1>" && INFO/SVLEN >= 5800' gnomad_v2.1_s

python proc_gnomad.py gnomad.line1.hg19.vcf > gnomad.line1.hg19.bed

liftOver gnomad.line1.hg19.bed hg19ToHg38.over.chain.gz gnomad.line1.hg38.bed.tmp gnomad.line1.unmapped
liftOver gnomad.line1.hg19.bed hg19ToHg38.over.chain.gz gnomad.line1.hg38.bed.tmp gnomad.line1.hg38.unmapped
liftOver gnomad.line1.hg19.bed hg19-chm13v2.over.chain.gz gnomad.line1.chm13v2.0.bed.tmp gnomad.line1.chm13v2.0.unmapped

python mod_label.py gnomad.line1.hg38.bed.tmp > gnomad.line1.hg38.bed
python mod_label.py gnomad.line1.chm13v2.0.bed.tmp > gnomad.line1.chm13v2.0.bed

##########

cat rmsk.line1.hg38.bed 1000genomes.line1.hg38.bed gnomad.line1.hg38.bed | sort -k1,1 -k2,2n -k3,3n > LINE1.hg38.bed
Expand All @@ -86,32 +113,49 @@ bgzip -c LINE1.hg19.bed > LINE1.hg19.bed.gz
tabix -p bed LINE1.hg19.bed.gz


cat rmsk.line1.chm13v2.0.bed 1000genomes.line1.chm13v2.0.bed gnomad.line1.chm13v2.0.bed | sort -k1,1 -k2,2n -k3,3n > LINE1.chm13v2.0.bed

bgzip -c LINE1.chm13v2.0.bed > LINE1.chm13v2.0.bed.gz

tabix -p bed LINE1.chm13v2.0.bed.gz


mv LINE1.hg38.bed.gz ../nanomonsv/data/
mv LINE1.hg38.bed.gz.tbi ../nanomonsv/data/

mv LINE1.hg19.bed.gz ../nanomonsv/data/
mv LINE1.hg19.bed.gz.tbi ../nanomonsv/data/

mv LINE1.chm13v2.0.bed.gz ../nanomonsv/data/
mv LINE1.chm13v2.0.bed.gz.tbi ../nanomonsv/data/

##########

rm -rf rmsk.line1.hg38.bed
rm -rf rmsk.line1.hg19.bed
rm -rf rmsk.line1.hg19.bed.tmp
rm -rf rmsk.line1.unmapped
rm -rf rmsk.line1.chm13v2.0.bed

rm -rf 1000genomes.line1.hg19.vcf
rm -rf 1000genomes.line1.hg19.bed
rm -rf 1000genomes.line1.hg38.bed
rm -rf 1000genomes.line1.hg38.bed.tmp
rm -rf 1000genomes.line1.unmapped
rm -rf 1000genomes.line1.hg38.unmapped
rm -rf 1000genomes.line1.chm13v2.0.bed
rm -rf 1000genomes.line1.chm13v2.0.bed.tmp
rm -rf 1000genomes.line1.chm13v2.0.unmapped

rm -rf gnomad.line1.hg19.vcf
rm -rf gnomad.line1.hg38.bed
rm -rf gnomad.line1.hg38.bed.tmp
rm -rf gnomad.line1.hg38.unmapped
rm -rf gnomad.line1.hg19.bed
rm -rf gnomad.line1.unmapped
rm -rf gnomad.line1.chm13v2.0.bed
rm -rf gnomad.line1.chm13v2.0.bed.tmp
rm -rf gnomad.line1.chm13v2.0.unmapped

rm -rf LINE1.hg38.bed
rm -rf LINE1.hg19.bed


rm -rf LINE1.chm13v2.0.bed

3 changes: 2 additions & 1 deletion LINE1_db/proc_1000genomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
lid = record.id
lstrand = record.info["MEINFO"][3]
label = ','.join([lchr, lstart, lend, lstrand, lid])
laf = float(record.info["AF"][0])
laf = 0
# laf = float(record.info["AF"][0])
# if laf < 0.01: continue

print('\t'.join([lchr, lstart, lend, label, str(laf), lstrand]))
Expand Down
3 changes: 2 additions & 1 deletion LINE1_db/proc_gnomad.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
# lstrand = record.info["MEINFO"][3]
lstrand = '*'
label = ','.join([lchr, lstart, lend, lstrand, lid])
laf = float(record.info["AF"][0])
laf = 0
# laf = float(record.info["AF"][0])
# if laf < 0.01: continue

print('\t'.join([lchr, lstart, lend, label, str(laf), lstrand]))
Expand Down
17 changes: 17 additions & 0 deletions LINE1_db/proc_rmsk_chm13.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#! /usr/bin/env python

import sys, gzip
import pysam

rmsk_file = sys.argv[1]

with gzip.open(rmsk_file, 'rt') as hin:
for line in hin:
F = line.rstrip('\n').split('\t')
# if len(F[5]) > 5: continue
if int(F[2]) - int(F[1]) < 5800: continue
if F[7] != "L1": continue
if not F[3] in ["L1HS", "L1PA2", "L1PA3", "L1PA4", "L1PA5"]: continue
label = ','.join([F[0], F[1], F[2], F[5], F[3]])
print('\t'.join([F[0], F[1], F[2], label, '0', F[5]]))

0 comments on commit cec73ed

Please sign in to comment.