NGS_scripts

Scripts for analyzing Next-Generation Sequencing data

Requirements

Perl - https://www.perl.org
R - http://www.r-project.org
Perl modules "Bio::DB::Fasta", "Bio::DB::Taxonomy" - https://bioperl.org
wget - https://www.gnu.org/software/wget/
Linux commands: sort, gzip, ...

Install

If you already have Git (https://git-scm.com) installed, you can get the latest development version using Git.

git clone https://github.com/jiwoongbio/NGS_scripts.git

Human RNA-seq data analysis

Setting reference genome data

# Requirements
# 1. Annomen - https://github.com/jiwoongbio/Annomen
# 2. BWA - http://bio-bwa.sourceforge.net
# 3. Samtools - http://www.htslib.org
# 4. STAR - https://github.com/alexdobin/STAR
#
# - You can use conda to install the requirements as follows:
# conda create -n Annomen -c bioconda perl perl-bioperl emboss
# conda install -n Annomen -c anaconda wget
# conda install -n Annomen -c bioconda bwa samtools star

git clone https://github.com/jiwoongbio/Annomen.git
mv Annomen Annomen.hg38
cd Annomen.hg38

# Generate genome annotation table
time ./Annomen_table.hg38.sh

# Generate sequence dictionary (set PICARD to the path of picard.jar before running)
#PICARD=where/is/picard.jar
time java -jar $PICARD CreateSequenceDictionary REFERENCE=genome.fasta

# Generate genome index
time bwa index genome.fasta
time samtools faidx genome.fasta

# Calculate genome sequence lengths
time fasta.length.pl genome.fasta > genome.length.txt

# Calculate RefSeq transcript sequence lengths
time fasta.length.pl refseq.transcript.fasta > refseq.transcript.length.txt

# List gene / transcript pairs
time perl gff_extract.pl -E */*_genomic.gff.gz gene transcript_id | table.search.pl refseq.transcript.length.txt 0 - 1 | sed -r 's/_([0-9]+\.[0-9]+)/\t\1/' | sort -t $'\t' -k1,1 -k2,2 -k3,3n | uniq | awk -F'\t' -vOFS='\t' '{print $1, $2"_"$3}' > gene.transcript.txt

# Extract transcript sequences with gene annotation
time sed 's/\t/ /g' refseq.transcript.fasta | tr '\n' '\t' | sed 's/\t$/\n/' | sed 's/\t>/\n>/g' | sed 's/^>//' | sed 's/ /\t/' | table.search.pl gene.transcript.txt 1 - 0 | sed 's/\t/ /' | sed 's/^/>/' | sed 's/\t/\n/g' > transcript.fasta

# Generate transcript index
time bwa index transcript.fasta
time samtools faidx transcript.fasta

# Calculate transcript sequence lengths
time fasta.length.pl transcript.fasta > transcript.length.txt

# List rRNA
time sed 's/\t/ /g' transcript.fasta | sed -n 's/^>//p' | sed 's/ /\t/' | sed -r 's/, ([^,]*)$/\t\1/' | awk -F'\t' '($3 == "ribosomal RNA" || $3 == "rRNA")' > rRNA.txt

# Fix GTF file
time gzip -dc */*_genomic.gtf.gz | grep -v '^#' | table.substitute_value.pl -i 0 -f chromosome.UCSC.txt -o - | table.search.pl genome.length.txt 0 - 0 > genome.gtf

# Generate STAR index
rm -rf STAR; mkdir STAR; time STAR --runThreadN 16 --runMode genomeGenerate --genomeDir STAR --genomeFastaFiles genome.fasta --sjdbGTFfile genome.gtf

# SpliceFisher
time git clone https://github.com/jiwoongbio/SpliceFisher.git
rm SpliceFisher/*.bam
rm SpliceFisher/*.txt

time (cd SpliceFisher; ./prepare.sh ../genome.gtf)
time (cd SpliceFisher; perl region_type.pl ../genome.gtf > region_type.txt)
time (cd SpliceFisher; perl region_type.pl -S forward ../genome.gtf | awk -F'\t' -vOFS='\t' '{print $1, $2, $3, "+", $4}' > region_type.forward.txt)
time (cd SpliceFisher; perl region_type.pl -S reverse ../genome.gtf | awk -F'\t' -vOFS='\t' '{print $1, $2, $3, "-", $4}' > region_type.reverse.txt)

# Generate genome + transcript index
time cat genome.fasta transcript.fasta > genome_transcript.fasta
time bwa index genome_transcript.fasta
time samtools faidx genome_transcript.fasta

# List gene / gene ID pairs
time perl gff_extract.pl -E */*_genomic.gff.gz gene Dbxref | table.delimitLines.pl - 1 | sed -n 's/GeneID://p' | sort -u > gene.gene_id.txt

# List gene / representative transcript pairs
time perl gff_extract.pl -E */*_genomic.gff.gz transcript_id tag | awk -F'\t' '($2 == "MANE Select")' | table.search.pl - 0 gene.transcript.txt 1 > gene.transcript.MANE_Select.txt
time perl gff_extract.pl -E */*_genomic.gff.gz transcript_id tag | awk -F'\t' '($2 == "RefSeq Select")' | table.search.pl - 0 gene.transcript.txt 1 > gene.transcript.RefSeq_Select.txt

time perl gff_extract.pl -E */*_genomic.gff.gz transcript_id tag | awk -F'\t' '($2 == "MANE Plus Clinical")' | table.search.pl - 0 gene.transcript.txt 1 > gene.transcript.MANE_Plus_Clinical.txt
time perl gff_extract.pl -E */*_genomic.gff.gz transcript_id tag | awk -F'\t' '($2 == "RefSeq Plus Clinical")' | table.search.pl - 0 gene.transcript.txt 1 > gene.transcript.RefSeq_Plus_Clinical.txt

time table.search.pl -v gene.transcript.MANE_Select.txt 0,1 gene.transcript.RefSeq_Select.txt 0,1 gene.transcript.MANE_Plus_Clinical.txt 0,1 gene.transcript.RefSeq_Plus_Clinical.txt 0,1 | cat gene.transcript.MANE_Select.txt - > gene.transcript.select.txt

# dbSNP 155
time wget --no-verbose --no-check-certificate https://ftp.ncbi.nlm.nih.gov/snp/archive/b155/VCF/GCF_000001405.39.gz
time gzip -dc GCF_000001405.39.gz | grep -v '^#' | table.substitute_value.pl -i 0 -f chromosome.UCSC.txt -o - | table.search.pl genome.length.txt 0 - 0 | bash -c "cat <(gzip -dc GCF_000001405.39.gz | head -n1000 | grep '^#') -" | perl leftalignIndel.pl - genome.fasta | perl sort_by_reference.pl - genome.fasta 0 1 | bgzip > snp_b155.vcf.gz
time tabix --preset vcf snp_b155.vcf.gz

# dbSNP 156
time wget --no-verbose --no-check-certificate https://ftp.ncbi.nlm.nih.gov/snp/archive/b156/VCF/GCF_000001405.40.gz
time gzip -dc GCF_000001405.40.gz | grep -v '^#' | table.substitute_value.pl -i 0 -f chromosome.UCSC.txt -o - | table.search.pl genome.length.txt 0 - 0 | bash -c "cat <(gzip -dc GCF_000001405.40.gz | head -n1000 | grep '^#') -" | perl leftalignIndel.pl - genome.fasta | perl sort_by_reference.pl - genome.fasta 0 1 | bgzip > snp_b156.vcf.gz
time tabix --preset vcf snp_b156.vcf.gz

Analyzing an RNA-seq sample

mkdir SAMPLE
ln -sf RAW_DATA/SAMPLE_R1.fastq.gz SAMPLE/SAMPLE.1.fastq.gz
ln -sf RAW_DATA/SAMPLE_R2.fastq.gz SAMPLE/SAMPLE.2.fastq.gz

generate_scripts.RNAseq.pl SAMPLE

sbatch.pl -p PARTITION_OF_32GB_FREE_RAM SAMPLE

Name		Name	Last commit message	Last commit date
Latest commit History 24 Commits
NCBI		NCBI
CPM.pl		CPM.pl
README.md		README.md
RPKM.pl		RPKM.pl
add_zero_depth.pl		add_zero_depth.pl
aldex_wilcox.R		aldex_wilcox.R
aligned_sequence.pl		aligned_sequence.pl
alignment.position_base.pl		alignment.position_base.pl
ancombc.R		ancombc.R
antismash.cluster_table.pl		antismash.cluster_table.pl
antismash.nrpspks_domain_table.pl		antismash.nrpspks_domain_table.pl
antismash.orf.pl		antismash.orf.pl
assembly_summary.genomic.pl		assembly_summary.genomic.pl
bam.region.sequence.pl		bam.region.sequence.pl
barplot.R		barplot.R
bash.pl		bash.pl
boxplot.R		boxplot.R
chromosome.match_depth.pl		chromosome.match_depth.pl
chromosome_plasmid.assembly.mapping.pl		chromosome_plasmid.assembly.mapping.pl
comma2tab.pl		comma2tab.pl
correctReadDepth.R		correctReadDepth.R
count.pl		count.pl
diamond.blastx.count.depth.pl		diamond.blastx.count.depth.pl
diamond.blastx.full_coverage.pl		diamond.blastx.full_coverage.pl
distance_matrix.mst_edge.R		distance_matrix.mst_edge.R
distance_matrix.mst_plot.R		distance_matrix.mst_plot.R
distance_matrix.phylip.pl		distance_matrix.phylip.pl
distance_matrix.pl		distance_matrix.pl
distance_methylation_count.pl		distance_methylation_count.pl
download_SRA_run_from_EBI.pl		download_SRA_run_from_EBI.pl
download_SRA_sample.pl		download_SRA_sample.pl
extract_fastq.pl		extract_fastq.pl
fasta.IUPAC2ACGT.pl		fasta.IUPAC2ACGT.pl
fasta.index.pl		fasta.index.pl
fasta.kmer.pl		fasta.kmer.pl
fasta.kmer_count.pl		fasta.kmer_count.pl
fasta.length.pl		fasta.length.pl
fasta.lineLength.pl		fasta.lineLength.pl
fasta.motif_count.pl		fasta.motif_count.pl
fasta.sequence.pl		fasta.sequence.pl
fasta.translate.pl		fasta.translate.pl
fasta2phylip.pl		fasta2phylip.pl
fasta2phylip.py		fasta2phylip.py
fastq.base_quality.base_count.pl		fastq.base_quality.base_count.pl
fastq.filter_pair.pl		fastq.filter_pair.pl
fastq.interleaved.pl		fastq.interleaved.pl
fastq.kmer_count.pl		fastq.kmer_count.pl
fastq.position_base_quality_ratio_table.pl		fastq.position_base_quality_ratio_table.pl
fastq.position_base_ratio_table.pl		fastq.position_base_ratio_table.pl
fastq.read_length.read_count.pl		fastq.read_length.read_count.pl
fixNewline.pl		fixNewline.pl
generate_scripts.RNAseq.pl		generate_scripts.RNAseq.pl
genome.depth.pl		genome.depth.pl
gggenes.R		gggenes.R
hclust.R		hclust.R
hclust.order.R		hclust.order.R
hclust_fan.R		hclust_fan.R
hclust_fan.color.R		hclust_fan.color.R
html_table.pl		html_table.pl
json.extract.pl		json.extract.pl
json.table.pl		json.table.pl
jukes_cantor_distance_matrix.pl		jukes_cantor_distance_matrix.pl
kraken2.read_count.pl		kraken2.read_count.pl
linePlot.R		linePlot.R
makeNewick.R		makeNewick.R
mds_plot.R		mds_plot.R
minimap2.reference_fastq.pl		minimap2.reference_fastq.pl
nj.color.R		nj.color.R
nj_fan.color.R		nj_fan.color.R
nj_tree.R		nj_tree.R
nj_tree.newick.R		nj_tree.newick.R
nj_tree_cladogram.R		nj_tree_cladogram.R
nj_tree_daylight.R		nj_tree_daylight.R
paf.minimum_match.pl		paf.minimum_match.pl
pdb.chain.pl		pdb.chain.pl
pdb.fasta.pl		pdb.fasta.pl
pdb.heterogen.count.pl		pdb.heterogen.count.pl
pdb.residue_pair.distance.pl		pdb.residue_pair.distance.pl
phylip.sample.pl		phylip.sample.pl
phylowgs.summ.fraction.pl		phylowgs.summ.fraction.pl
phylowgs.summ.plot_tree.pl		phylowgs.summ.plot_tree.pl
phylowgs.summ.sample_name.pl		phylowgs.summ.sample_name.pl
phylowgs.summ.select_tree.pl		phylowgs.summ.select_tree.pl
phylowgs.summ.tree_edge.pl		phylowgs.summ.tree_edge.pl
pileup.base_to_base_count.pl		pileup.base_to_base_count.pl
pileup.variant.pl		pileup.variant.pl
plot_graph_pie.R		plot_graph_pie.R
png2html_table.color.pl		png2html_table.color.pl
png2html_table.pl		png2html_table.pl
raxml.R		raxml.R
region.common.pl		region.common.pl
region.continue.pl		region.continue.pl
region.depth.pl		region.depth.pl
region.fasta.pl		region.fasta.pl
region.homozygosity_rate.pl		region.homozygosity_rate.pl
region.image.pl		region.image.pl
region.overlap_count.pl		region.overlap_count.pl
region.plot.R		region.plot.R
region.read_count.pl		region.read_count.pl
region.sequence.pl		region.sequence.pl

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

NGS_scripts

Requirements

Install

Human RNA-seq data analysis

About

Uh oh!

Releases 1

Packages

Uh oh!

Contributors

Uh oh!

Languages

Folders and files

Latest commit

History

Repository files navigation

NGS_scripts

Requirements

Install

Human RNA-seq data analysis

About

Resources

Uh oh!

Stars

Watchers

Forks

Releases 1

Packages 0

Uh oh!

Contributors

Uh oh!

Languages

Packages