軟件安裝
# Install GATK 4.4.0.0: unpack the release archive under /opt/biosoft and
# append its directory to PATH through ~/.bashrc, then reload ~/.bashrc.
gatk_zip=~/software/gatk-4.4.0.0.zip
unzip "$gatk_zip" -d /opt/biosoft
printf '%s\n' 'PATH=$PATH:/opt/biosoft/gatk-4.4.0.0/' >> ~/.bashrc
. ~/.bashrc
# Create and enter the working directory for variant calling.
workdir=/home/train/07.variants_calling
mkdir -p "$workdir"
cd "$workdir"
# Prepare the input BAM files and the reference genome files.
# Link the Bowtie2 SAM alignments into the working directory.
ln -s ~/06.reads_aligment/bowtie2/*.sam ./
picard_jar=/opt/biosoft/picard-tools/picard.jar
for sample in V1 V2; do
  # Sort the alignments and write them out as BAM.
  samtools sort -@ 4 -O BAM -o "${sample}.bam" "${sample}.sam"
  # PCR amplification before sequencing produces duplicated reads, so mark
  # the duplicates before calling variants.
  java -jar "$picard_jar" MarkDuplicates I="${sample}.bam" O="${sample}.MD.bam" M="${sample}.metrics"
  samtools index "${sample}.MD.bam"
done
# Reference genome: link it as genome.fasta, then build the .fai index and
# the sequence dictionary.
ln -s ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta genome.fasta
samtools faidx genome.fasta
java -jar "$picard_jar" CreateSequenceDictionary R=genome.fasta O=genome.dict
# SNP/InDel calling with GATK HaplotypeCaller is done in its own directory.
gatk_dir=/home/train/07.variants_calling/GATK
mkdir -p "$gatk_dir"
cd "$gatk_dir"
# Link in the BAM files (with their indexes) and the reference genome files
# from the parent directory.
ln -s ../*bam* ../genome.* .
[train@MiWiFi-R3P-srv GATK]$ ln -s ../*bam* .
ln -s ../genome.* .
[train@MiWiFi-R3P-srv GATK]$ ls
genome.dict genome.fasta genome.fasta.fai V1.bam V1.MD.bam V1.MD.bam.bai V2.bam V2.MD.bam V2.MD.bam.bai
# Run HaplotypeCaller on each sample separately; this is sample V1.
# -ERC GVCF writes a per-sample GVCF for the later joint-genotyping steps.
gatk HaplotypeCaller \
  -R genome.fasta \
  -I V1.MD.bam \
  -ERC GVCF \
  -O V1.g.vcf \
  --pcr-indel-model CONSERVATIVE \
  --sample-ploidy 2 \
  --min-base-quality-score 10 \
  --kmer-size 10 --kmer-size 25
# Recorded run time:
# real 2m13.747s
# user 4m26.141s
# sys 0m1.111s
#遇到報錯
[train@MiWiFi-R3P-srv GATK]$ gatk HaplotypeCaller -R genome.fasta -I V1.MD.bam -ERC GVCF -O V1.g.vcf --pcr-indel-model CONSERVATIVE --sample-ploidy 2 --min-base-quality-score 10 --kmer-size 10 --kmer-size 25
Using GATK jar /opt/biosoft/gatk-4.4.0.0/gatk-package-4.4.0.0-local.jar
Running:
java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /opt/biosoft/gatk-4.4.0.0/gatk-package-4.4.0.0-local.jar HaplotypeCaller -R genome.fasta -I V1.MD.bam -ERC GVCF -O V1.g.vcf --pcr-indel-model CONSERVATIVE --sample-ploidy 2 --min-base-quality-score 10 --kmer-size 10 --kmer-size 25
Error: LinkageError occurred while loading main class org.broadinstitute.hellbender.Main
java.lang.UnsupportedClassVersionError: org/broadinstitute/hellbender/Main has been compiled by a more recent version of the Java Runtime (class file version 61.0), this version of the Java Runtime only recognizes class file versions up to 55.0
#解決方法
[train@MiWiFi-R3P-srv GATK]$ which java
/usr/bin/java
[train@MiWiFi-R3P-srv GATK]$ ls /opt/sysoft/j
jdk-20.0.1/ jre1.7.0_05/ jre1.8.0_371/
[train@MiWiFi-R3P-srv GATK]$ ls /opt/sysoft/jdk-20.0.1/bin/
jar java javadoc jcmd jdb jdeps jhsdb jinfo jmap jpackage jrunscript jstack jstatd keytool serialver
jarsigner javac javap jconsole jdeprscan jfr jimage jlink jmod jps jshell jstat jwebserver rmiregistry
[train@MiWiFi-R3P-srv GATK]$ echo 'PATH=/opt/sysoft/jdk-20.0.1/bin/:$PATH' >> ~/.bashrc
[train@MiWiFi-R3P-srv GATK]$ source ~/.bashrc
[train@MiWiFi-R3P-srv GATK]$ which java
/opt/sysoft/jdk-20.0.1/bin/java
# Per-sample HaplotypeCaller run for sample V2, written as a GVCF.
gatk HaplotypeCaller \
  -R genome.fasta \
  -I V2.MD.bam \
  -ERC GVCF \
  -O V2.g.vcf \
  --pcr-indel-model CONSERVATIVE \
  --sample-ploidy 2 \
  --min-base-quality-score 10 \
  --kmer-size 10 --kmer-size 25
# Recorded run time:
# real 2m26.303s
# user 5m44.855s
# sys 0m1.106s
# Batch mode for many samples: write one HaplotypeCaller command line per
# duplicate-marked BAM into a command list that can be run in parallel.
# Run this in the directory that holds the *.MD.bam files.

#######################################
# Print one "gatk HaplotypeCaller ..." command per *.MD.bam file found in the
# current directory, using the same options as the single-sample runs.
# Arguments: none
# Outputs:   one command line per sample on stdout; nothing if no BAM matches
#######################################
write_haplotypecaller_commands() {
  local bam sample
  for bam in *.MD.bam; do
    # An unmatched glob stays literal; skip it instead of emitting a bogus line.
    [[ -e "$bam" ]] || continue
    sample=${bam%.MD.bam}
    # Both k-mer sizes (10 and 25) are passed, matching the single-sample runs.
    printf '%s\n' "gatk HaplotypeCaller -R genome.fasta -I ${sample}.MD.bam -ERC GVCF -O ${sample}.g.vcf --pcr-indel-model CONSERVATIVE --sample-ploidy 2 --min-base-quality-score 10 --kmer-size 10 --kmer-size 25"
  done
}

# The list file name (including its "Haplotupe" spelling) is kept as-is
# because the ParaFly invocation refers to it by that name.
write_haplotypecaller_commands > command.gatk_HaplotupeCaller.list
[train@MiWiFi-R3P-srv ~]$ ParaFly -c command.gatk_HaplotupeCaller.list -CPU 20
# The reads around variant sites can be inspected with samtools tview.
samtools tview V1.MD.bam genome.fasta
# Merge the per-sample GVCF files into a single GVCF with CombineGVCFs.
gvcf_inputs=(-V V1.g.vcf -V V2.g.vcf)
gatk CombineGVCFs -R genome.fasta -O combined.g.vcf "${gvcf_inputs[@]}"
# Recorded run time:
# real 0m16.789s
# user 0m33.000s
# sys 0m0.811s
#可以批量進行
[train@MiWiFi-R3P-srv GATK]$ ls V?.g.vcf | perl -pe 's/^/-V /' | perl -pe 's/\n/ /'
-V V1.g.vcf -V V2.g.vcf
[train@MiWiFi-R3P-srv GATK]$ gatk CombineGVCFs -R genome.fasta -O combined.g.vcf `ls V?.g.vcf | perl -pe 's/^/-V /' | perl -pe 's/\n/ /'`
Using GATK jar /opt/biosoft/gatk-4.4.0.0/gatk-package-4.4.0.0-local.jar
Running:
java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /opt/biosoft/gatk-4.4.0.0/gatk-package-4.4.0.0-local.jar CombineGVCFs -R genome.fasta -O combined.g.vcf -V V1.g.vcf -V V2.g.vcf
09:24:04.472 INFO NativeLibraryLoader - Loading libgkl_compression.so from jar:file:/opt/biosoft/gatk-4.4.0.0/gatk-package-4.4.0.0-local.jar!/com/intel/gkl/native/libgkl_compression.so
09:24:04.511 INFO CombineGVCFs - ------------------------------------------------------------
09:24:04.514 INFO CombineGVCFs - The Genome Analysis Toolkit (GATK) v4.4.0.0
09:24:04.514 INFO CombineGVCFs - For support and documentation go to https://software.broadinstitute.org/gatk/
09:24:04.514 INFO CombineGVCFs - Executing as train@MiWiFi-R3P-srv on Linux v5.14.0-284.11.1.el9_2.x86_64 amd64
09:24:04.515 INFO CombineGVCFs - Java runtime: Java HotSpot(TM) 64-Bit Server VM v20.0.1+9-29
09:24:04.515 INFO CombineGVCFs - Start Date/Time: July 21, 2023, 9:24:04?AM CST
09:24:04.515 INFO CombineGVCFs - ------------------------------------------------------------
09:24:04.515 INFO CombineGVCFs - ------------------------------------------------------------
09:24:04.516 INFO CombineGVCFs - HTSJDK Version: 3.0.5
09:24:04.516 INFO CombineGVCFs - Picard Version: 3.0.0
09:24:04.516 INFO CombineGVCFs - Built for Spark Version: 3.3.1
09:24:04.516 INFO CombineGVCFs - HTSJDK Defaults.COMPRESSION_LEVEL : 2
09:24:04.516 INFO CombineGVCFs - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
09:24:04.516 INFO CombineGVCFs - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
09:24:04.517 INFO CombineGVCFs - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
09:24:04.517 INFO CombineGVCFs - Deflater: IntelDeflater
09:24:04.517 INFO CombineGVCFs - Inflater: IntelInflater
09:24:04.517 INFO CombineGVCFs - GCS max retries/reopens: 20
09:24:04.517 INFO CombineGVCFs - Requester pays: disabled
09:24:04.517 INFO CombineGVCFs - Initializing engine
09:24:04.614 INFO FeatureManager - Using codec VCFCodec to read file file:///home/train/07.variants_calling/GATK/V1.g.vcf
09:24:04.636 INFO FeatureManager - Using codec VCFCodec to read file file:///home/train/07.variants_calling/GATK/V2.g.vcf
09:24:04.713 INFO CombineGVCFs - Done initializing engine
09:24:04.731 INFO ProgressMeter - Starting traversal
09:24:04.732 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
09:24:04.804 WARN ReferenceConfidenceVariantContextMerger - Detected invalid annotations: When trying to merge variant contexts at location MS01Contig01:1208 the annotation MLEAC=[1, 0] was not a numerical value and was ignored
09:24:14.734 INFO ProgressMeter - MS01Contig04:576896 0.2 1056000 6336000.0
09:24:21.235 INFO ProgressMeter - MS01Contig11:54282 0.3 1884473 6851383.4
09:24:21.235 INFO ProgressMeter - Traversal complete. Processed 1884473 total variants in 0.3 minutes.
09:24:21.251 INFO CombineGVCFs - Shutting down engine
[July 21, 2023, 9:24:21?AM CST] org.broadinstitute.hellbender.tools.walkers.CombineGVCFs done. Elapsed time: 0.28 minutes.
Runtime.totalMemory()=220200960
# Run GenotypeGVCFs on the combined GVCF to identify joint-called variants.
gatk GenotypeGVCFs \
  -R genome.fasta \
  -O variants.raw.vcf \
  -V combined.g.vcf \
  --sample-ploidy 2
# Recorded run time:
# real 0m14.088s
# user 0m25.331s
# sys 0m0.668s
# Run VariantFiltration to hard-filter the raw variants.
# In theory a better approach is variant quality score recalibration (VQSR):
# a machine-learning model trained on known, accurate variant sites, run with
# VariantRecalibrator. It suits large sample numbers and large data sets, and
# is not suitable for small sample sets, reduced-representation sequencing or
# transcriptome sequencing.
#
# A record matching an expression gets the paired filter name written to its
# FILTER column; the record itself stays in variants.filter.vcf.
#   FilterQual            QUAL < 30.0            low variant call quality
#   FilterQD              QD < 13.0              low call quality relative to read depth
#   FilterMQ              MQ < 20.0              low RMS mapping quality at the site
#   FilterFS              FS > 20.0              strand bias (Fisher's exact test)
#   FilterMQRankSum       MQRankSum < -3.0       alt-allele reads map worse than ref-allele reads
#   FilterReadPosRankSum  ReadPosRankSum < -3.0  alt allele sits nearer read ends than ref allele
#   FilterBaseQRankSum    BaseQRankSum < -3.0    alt-allele bases have lower quality than ref-allele bases
gatk VariantFiltration -R genome.fasta -O variants.filter.vcf -V variants.raw.vcf \
  --filter-name FilterQual --filter-expression "QUAL < 30.0" \
  --filter-name FilterQD --filter-expression "QD < 13.0" \
  --filter-name FilterMQ --filter-expression "MQ < 20.0" \
  --filter-name FilterFS --filter-expression "FS > 20.0" \
  --filter-name FilterMQRankSum --filter-expression "MQRankSum < -3.0" \
  --filter-name FilterReadPosRankSum --filter-expression "ReadPosRankSum < -3.0" \
  --filter-name FilterBaseQRankSum --filter-expression "BaseQRankSum < -3.0"
# Recorded run time:
# real 0m3.596s
# user 0m5.314s
# sys 0m0.233s
[train@MiWiFi-R3P-srv GATK]$ grep -v "#" variants.filter.vcf |wc -l
7632
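雜合率 ≈ 7632 / 基因組大小(注意:7632 是 variants.filter.vcf 中全部變異記錄的數量,其中包含被標記為未通過過濾的位點;若只統計通過過濾的位點,應對去除 Filter 標記記錄之后的 variants.vcf 進行計數)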
獲得snp位點和indel的vcf
[train@MiWiFi-R3P-srv GATK]$ gatk SelectVariants -V variants.raw.vcf -select-type SNP -O snp.vcf
[train@MiWiFi-R3P-srv GATK]$ gatk SelectVariants -V variants.raw.vcf -select-type INDEL -O indel.vcf
# Keep only the lines of variants.filter.vcf that do not contain a tab
# followed by "Filter", i.e. drop the records tagged with one of the
# Filter* names, and save the rest as variants.vcf.
grep --invert-match --perl-regexp "\tFilter" variants.filter.vcf > variants.vcf
根據INDEL信息批量設計引物
# Batch primer design from the INDEL calls, in its own working directory.
primer_dir=/home/train/07.variants_calling/INDEL_primer_design
mkdir -p "$primer_dir"
cd "$primer_dir"
# Link the reference genome files and the primer3 settings file.
ln -s ../genome.* .
ln -s ~/05.genome_feature_analysis/SSR_detecting_and_primer_design/p3_settings_file .
# Extract only the INDEL records from the filtered variant set.
gatk SelectVariants \
  -R genome.fasta \
  -V ../GATK/variants.vcf \
  -O INDELs.vcf \
  --select-type-to-include INDEL
# Design primers for the INDELs; the GFF3 goes to --gff3_out and the
# program's standard output is saved to VCF_InDel_primer3.out.
VCF_InDel_primer3.pl INDELs.vcf genome.fasta \
  --CPU 4 \
  --p3_setting_file p3_settings_file \
  --gff3_out VCF_InDel_primer3.gff3 \
  > VCF_InDel_primer3.out
# Recorded run time:
# real 3m0.606s
# user 21m44.174s
# sys 0m0.546s
SnpEff注釋
軟件安裝
# Install snpEff (http://snpeff.sourceforge.net/) by unpacking the core
# archive under /opt/biosoft/. The download step is kept commented out:
#wget https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip -P ~/software/
snpeff_zip=~/software/snpEff_latest_core.zip
unzip "$snpeff_zip" -d /opt/biosoft/
gff3ToGtf.pl程序在下面這個軟件里
[train@MiWiFi-R3P-srv SnpEff]$ tar zxf ~/software/geta-2.6.1.tar.gz -C /opt/biosoft/
[train@MiWiFi-R3P-srv SnpEff]$ echo 'PATH=$PATH:/opt/biosoft/geta-2.6.1/bin' >> ~/.bashrc
[train@MiWiFi-R3P-srv SnpEff]$ source ~/.bashrc
# Work in a dedicated SnpEff directory.
snpeff_workdir=/home/train/07.variants_calling/SnpEff
mkdir -p "$snpeff_workdir"
cd "$snpeff_workdir"

# Build the SnpEff database for malassezia_sympodialis.
snpeff_home=/opt/biosoft/snpEff
db_dir="${snpeff_home}/data/malassezia_sympodialis"
genome_fa=~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta
annot_dir=/home/train/00.incipient_data/data_for_gene_prediction_and_RNA-seq
mkdir -p "${db_dir}/"
# Genome sequence.
cp "$genome_fa" "${db_dir}/sequences.fa"
# Gene models: strip the UTR regions from the GFF3 first (for large animal
# or plant genomes this removal may be skipped), then convert to GTF.
gff3_remove_UTR.pl "${annot_dir}/Malassezia_sympodialis_V01.bestGeneModels.gff3" > Malassezia_sympodialis.gff3
gff3ToGtf.pl "$genome_fa" Malassezia_sympodialis.gff3 > "${db_dir}/genes.gtf"
# A standard GTF has no blank lines, so remove them in place.
perl -p -i -e 's/^\s*$//' "${db_dir}/genes.gtf"
# CDS and protein sequences.
cp "${annot_dir}/Malassezia_sympodialis_V01.CDS.fasta" "${db_dir}/cds.fa"
cp "${annot_dir}/Malassezia_sympodialis_V01.protein.fasta" "${db_dir}/protein.fa"
# Register the genome in the snpEff configuration and build the database.
printf '%s\n' "malassezia_sympodialis.genome : malassezia sympodialis" >> "${snpeff_home}/snpEff.config"
java -jar "${snpeff_home}/snpEff.jar" build -c "${snpeff_home}/snpEff.config" -gtf22 -v malassezia_sympodialis
# Run the SnpEff annotation. The CSV and HTML statistics go to the files
# named by -csvStats and -s; the annotated VCF is written to stdout and
# redirected to variant.SnpEff.vcf.
java -Xmx2G -jar /opt/biosoft/snpEff/snpEff.jar eff \
  -csvStats variants.SnpEff.csv \
  -s variants.SnpEff.html \
  -c /opt/biosoft/snpEff/snpEff.config \
  -v \
  -ud 500 \
  malassezia_sympodialis \
  ~/00.incipient_data/data_for_variants_calling/variants.vcf \
  > variant.SnpEff.vcf
# Recorded run time:
# real 0m6.132s
# user 0m11.224s
# sys 0m0.369s
[train@MiWiFi-R3P-srv SnpEff]$ ls
Malassezia_sympodialis.genome_V01.fasta Malassezia_sympodialis.gff3 variant.SnpEff.vcf variants.SnpEff.csv variants.SnpEff.genes.txt variants.SnpEff.html
[train@MiWiFi-R3P-srv SnpEff]$ firefox variants.SnpEff.html
如何去除indel位點?gatk
如何統計某個變異位點頻率?https://blog.csdn.net/upyours00/article/details/106910961
篩選連鎖不平衡位點?plink