Scripts for processing WES data from Colorectal Cancer Metastasis
Copy "align.sh" to the working directory.

# Run the alignment pipeline on each sample directory (names ending in "0"), from FASTQ through base-quality recalibration
for name in `ls -d *[0]`
do
# fq1/fq2 must point to the sample's paired-end FASTQ files; the naming below is an assumed convention, adjust to your layout
fq1=${name}/${name}_1.fq.gz
fq2=${name}/${name}_2.fq.gz
dna_analysis.sh -n $name -t 16 -s fastq -e reCal -1 $fq1 -2 $fq2
done
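A quick integrity check on the recalibrated BAMs before the Mutect2 steps (a sketch; assumes samtools is on PATH and that the BAMs land in a bamFiles directory, as referenced by the HaplotypeCaller step below):

for b in ../bamFiles/*.sorted.dedup.rg.recal.bam
do
samtools quickcheck $b || echo "FAILED: $b"
done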
Copy "filter.sh" to the mutect2/filteredVcf directory.

for i in ../rawVcf/*.vcf.gz
do
name=`basename $i .vcf.gz`
./filter.sh $name $i ../../contamination/$name.calculatecontamination.table ../../ffpe/$name.artifact.pre_adapter_detail_metrics
done
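filter.sh itself is not shown here; a minimal sketch of what it is expected to wrap, assuming the standard GATK4 Mutect2 filtering steps (FilterMutectCalls with the contamination table, then FilterByOrientationBias with the Picard pre-adapter metrics; flags per GATK 4.0/4.1, verify against your version):

#!/bin/bash
# Usage: ./filter.sh <name> <raw vcf> <contamination table> <pre_adapter_detail_metrics>
name=$1; vcf=$2; contam=$3; metrics=$4
# Flag calls likely caused by cross-sample contamination
gatk FilterMutectCalls -V $vcf --contamination-table $contam -O $name.oncefiltered.vcf.gz
# Flag FFPE deamination artifacts (C/T orientation bias) using the pre-adapter metrics
gatk FilterByOrientationBias -AM C/T -V $name.oncefiltered.vcf.gz -P $metrics -O $name.filtered.vcf.gz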
# For each patient, pool the candidate sites from all of its filtered VCFs into one sorted, unique BED
for i in BXY DDY DWY LDS WLF YHD
do
cd $i
for j in *.vcf
do
convert2annovar.pl --format vcf4old $j 2>/dev/null |cut -f 1-5 >>$i.raw.bed
done
sort -V $i.raw.bed|uniq >$i.sorted.uniq.bed
cd ..
done
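Optional sanity check: the number of union candidate sites per patient that will be force-genotyped below.

wc -l */*.sorted.uniq.bed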
# Force-call genotypes at every union site in each sample's recalibrated BAM, emitting a GVCF per sample
for i in */*.somatic_hardfiltered.clean.vcf
do
nohup gatk HaplotypeCaller -L `dirname $i`/`dirname $i`.sorted.uniq.bed --reference /database/ref/hg19.fa --input ../bamFiles/`basename $i .somatic_hardfiltered.clean.vcf`.sorted.dedup.rg.recal.bam --output `dirname $i`/`basename $i .somatic_hardfiltered.clean.vcf`.g.vcf -ERC GVCF --output-mode EMIT_ALL_SITES >log.`basename $i .somatic_hardfiltered.clean.vcf` &
done
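The HaplotypeCaller jobs above are launched in the background with nohup; block until they all exit before joint genotyping (wait only tracks children of the current shell, so run it in the same session that launched the jobs):

wait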
# Per patient: combine the per-sample GVCFs and jointly genotype at the union sites
for i in BXY DDY DWY LDS WLF YHD
do
for j in $i/*.g.vcf;do echo " -V "$j;done|xargs gatk CombineGVCFs -R /database/ref/hg19.fa -O $i/${i}.g.vcf.gz
gatk GenotypeGVCFs -L ${i}/${i}.sorted.uniq.bed -R /database/ref/hg19.fa -V ${i}/${i}.g.vcf.gz -O ${i}/${i}.jointlycall.vcf.gz
done
# Annotate the jointly-called variants and extract per-sample allelic depths into a table
for i in BXY DDY DWY LDS WLF YHD
do
table_annovar.pl ${i}/${i}.jointlycall.vcf.gz /database/annotation/annovar/humandb/ -buildver hg19 -out ${i}/${i} -remove -protocol refGene,cytoBand -operation g,r -nastring . -vcfinput
gatk VariantsToTable -V ${i}/${i}.hg19_multianno.vcf -F cytoBand -F POS -F REF -F ALT -F Func.refGene -F Gene.refGene -GF AD -O ${i}/${i}.table
# mutatedRead: per-sample ALT read counts at kept sites (exonic/splicing, cytoBand present, zero ALT reads in the NORMAL sample)
awk 'BEGIN{OFS="\t"} {if(NR==1){printf "Chromosome\tPosition\tChange\tGene";for(i=7;i<=NF;i++){split($i,m,".");printf "\t"m[1]}printf "\n"}else{ if ($1 != "." && ($5 ~/^exonic$/ || $5 ~/^splicing$/ ) && $NF ~ /,0$/ ){printf "chr"$1"\t"$2"\t"$3">"$4"\t"$6;for(i=7;i<=NF;i++){split($i,m,",");printf "\t"m[2]}printf "\n"}}}' ${i}/${i}.table >${i}/${i}.mutatedRead.txt
# coverage: per-sample total depth (REF+ALT reads) at the same sites
awk 'BEGIN{OFS="\t"} {if(NR==1){printf "Chromosome\tPosition\tChange\tGene";for(i=7;i<=NF;i++){split($i,m,".");printf "\t"m[1]}printf "\n"}else{ if ($1 != "." && ($5 ~/^exonic$/ || $5 ~/^splicing$/ ) && $NF ~ /,0$/ ){printf "chr"$1"\t"$2"\t"$3">"$4"\t"$6;for(i=7;i<=NF;i++){split($i,m,",");printf "\t"(m[1]+m[2])}printf "\n"}}}' ${i}/${i}.table >${i}/${i}.coverage.txt
cp $i/$i.coverage.txt ~/treeomics/src/input/$i/
cp $i/$i.mutatedRead.txt ~/treeomics/src/input/$i/
done

Run the first two steps of the Treeomics input preparation above to generate the .table files used below.
ONLY exonic and splicing variants are retained; sites on chrM or with alternative reads in the NORMAL sample were removed.
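In the awk filters, cytoBand is "." where annotation is absent (e.g. chrM), so $1 != "." drops those sites, and GATK writes each sample's AD field as "refReads,altReads", so $NF ~ /,0$/ keeps only sites whose last column (the NORMAL sample here) shows zero alternative reads. A toy illustration of the AD parsing:

printf '12,0\n8,3\n' | awk '{split($1,m,","); print "ref="m[1], "alt="m[2], ($1 ~ /,0$/ ? "keep" : "drop")}'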
# Split each patient's table into per-sample, BED-like files carrying ref/alt read counts (PyClone input precursors)
for i in BXY DDY DWY LDS WLF YHD
do
awk 'BEGIN{OFS="\t"}{ if(NR==1){for(i=7;i<=NF;i++){split($i,n,".");f[i]=n[1]; printf "" >f[i]".txt"}} else{ split($1,h,"[pq]"); for(i=7;i<=NF;i++){ split($i,m,","); if($1 != "." && ($5 ~/^exonic$/ || $5 ~/^splicing$/) && length(m)==2 && $NF ~ /,0$/ ){printf "chr"h[1]"\t"($2-1)"\t"$2"\tchr"h[1]":"$2":"$3":"$4"\t"m[1]"\t"m[2]"\t2\t0\t-""\n" >>f[i]".txt" } } } }' $i.table
done

for i in *.seg
do
# the copy number MUST be an integer
intersectBed -a `basename $i .seg`.txt -b $i -wb |awk 'BEGIN{OFS="\t"}{if(NR==1){print "mutation_id\tref_counts\tvar_counts\tnormal_cn\tminor_cn\tmajor_cn\n"$4,$5,$6,$7,$8,int(2^$14 * 2 +0.5)}else{print $4,$5,$6,$7,$8,int(2^$14 * 2 +0.5)}}' - >`basename $i .seg`.tsv
done
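The int(2^$14 * 2 + 0.5) term rounds the segment mean (a log2 copy ratio, assumed to sit in column 14 after the intersect) to an integer total copy number. For example, a log2 ratio of 0.58 gives round(2^0.58 * 2) = 3:

echo 0.58 | awk '{print int(2^$1 * 2 + 0.5)}'   # prints 3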
# Normal samples were missed in the calculation above; create tsv files for them with a fixed total copy number of 2
for i in *NOM*.txt
do
awk 'BEGIN{OFS="\t"}{print $4,$5,$6,$7,$8,2}' $i |sed "1imutation_id\tref_counts\tvar_counts\tnormal_cn\tminor_cn\tmajor_cn">`basename $i .txt`.tsv
done

Create the config file manually, following the demo.
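A minimal per-patient config sketch modeled on the PyClone demo (the sample name BXY_T1 and all numeric values are illustrative placeholders; list each of the patient's samples under samples):

cat >config.bxy.yaml <<'EOF'
working_dir: .
trace_dir: trace_bxy
density: pyclone_beta_binomial
num_iters: 10000
base_measure_params:
  alpha: 1
  beta: 1
concentration:
  value: 1.0
  prior:
    shape: 1.0
    rate: 0.001
beta_binomial_precision_params:
  value: 1000
  prior:
    shape: 1.0
    rate: 0.0001
  proposal:
    precision: 0.01
samples:
  BXY_T1:
    mutations_file: BXY_T1.yaml
    tumour_content:
      value: 1.0
    error_rate: 0.001
EOF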
# activate the virtual Python 2.7 environment (PyClone runs under Python 2)
source activate python27
# build the yaml files
for i in ../tsv/*.tsv
do
PyClone build_mutations_file --prior total_copy_number --in_file $i --out_file `basename $i .tsv`.yaml
done
# run the main program of PyClone
for i in bxy ddy dwy lds wlf yhd
do
mkdir trace_${i}
PyClone run_analysis --config_file config.${i}.yaml --outfile trace_${i} --seed 1986
PyClone build_table --config_file config.${i}.yaml --outfile ${i}.table --table_type loci
done
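A quick post-run summary: count mutations per cluster in each loci table (a sketch; assumes cluster_id is the third column of PyClone's loci output):

for i in bxy ddy dwy lds wlf yhd
do
echo $i
tail -n +2 $i.table |cut -f 3 |sort |uniq -c
done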
Use the tsv files created above and the homemade script "intersect".

for i in *.tsv
do
name=`basename $i .tsv`
# Prepare the annotation input files
tail -n +2 $i |awk -F "[:\t]" 'BEGIN{OFS="\t"}{print $1,$2-length($3)+1,$2,$3,$4,$0}' - >$name.avinput
# Annotation
annotate_variation.pl -buildver hg19 -outfile $name $name.avinput /database/annotation/annovar/humandb
# Remove synonymous variants
awk '$2~/^synonymous/' $name.exonic_variant_function >$name.tmp
intersect -c 8 9 -r $name.variant_function -f $name.variant_function $name.tmp -o 1 >$name.txt
rm $name.tmp
# Create tsv files (write them into a tsv/ subdirectory so the original tsv files are not overwritten)
mkdir -p tsv
# Aggregate per gene: strip the "(...)" detail from gene names, sum ref/var read counts, and average the copy number over the gene's variants
sort -k2 $name.txt |awk 'BEGIN{OFS="\t"}{if($2~/\(/){sub(/\(\S+\)/,"",$2)};m[$2]=m[$2]+$9;n[$2]=n[$2]+$10;h[$2]=h[$2]+$13;a[$2]=a[$2]+1}END{for(k in m){print k,m[k],n[k],2,0,int(h[k]/a[k])}}' - >tsv/$name.tsv
done
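The gene-level tsv files written above have the same columns as the PyClone input format used earlier, so they can presumably be fed back through the same build step (mirroring the build_mutations_file loop above):

for i in tsv/*.tsv
do
PyClone build_mutations_file --prior total_copy_number --in_file $i --out_file `basename $i .tsv`.yaml
done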