0% found this document useful (0 votes)
24 views9 pages

Group17 2

Uploaded by

sandeep
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
24 views9 pages

Group17 2

Uploaded by

sandeep
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

group17

September 18, 2023

1 Question 1
Solution
Finding all possible k-mers

[27]: import concurrent.futures


import logging
import sys
import math
from collections import Counter
from tqdm import tqdm

class FastaParser:
def __init__(self, file_path):
self.file_path = file_path
self.sequences = {}
self.sequence_lengths = {} # Store lengths of sequences for quick␣
↪access

def parse(self):
current_sequence = []
current_sequence_name = None

try:
with open(self.file_path, 'r') as file:
for line in file:
line = line.strip()
if line.startswith('>'):
# If a new sequence header is encountered, store the␣
↪previous sequence

if current_sequence:
sequence_data = ''.join(current_sequence)
self.sequences[current_sequence_name] =␣
↪sequence_data

self.sequence_lengths[current_sequence_name] =␣
↪len(sequence_data)

1
# Set the current sequence name and reset the current␣
↪sequence
current_sequence_name = line[1:]
current_sequence = []
else:
# Append the current line to the current sequence
current_sequence.append(line)

# Store the last sequence encountered


if current_sequence:
sequence_data = ''.join(current_sequence)
self.sequences[current_sequence_name] = sequence_data
self.sequence_lengths[current_sequence_name] =␣
↪len(sequence_data)

except Exception as e:
logging.error(f"Error while parsing: {str(e)}")

def get_sequences(self):
return self.sequences

def get_sequence_lengths(self):
return self.sequence_lengths

# Define a class to count k-mers in sequences


class KmerCounter:
def __init__(self, k):
self.k = k
self.kmer_counts = Counter() # Use Counter for efficient counting
self.total_kmers = 0

def count_kmers(self, sequence):


sequence_length = len(sequence)

# Initialize a Counter to track k-mer counts


kmer_counter = Counter()

# Create a sliding window of size k


for i in range(sequence_length - self.k + 1):
kmer = sequence[i:i + self.k]

# Increment the count of the k-mer in the Counter


kmer_counter.update([kmer])

2
# Merge the counts from the local Counter into the main kmer_counts␣
↪dictionary
self.kmer_counts.update(kmer_counter)

# Update the total_kmers count based on the local kmer_counter


self.total_kmers += sum(kmer_counter.values())

def get_kmer_counts(self):
return self.kmer_counts

# Function to count k-mers in a FASTA file


def count_kmers_in_fasta(file_path, k, num_threads=1):
fasta_parser = FastaParser(file_path)
fasta_parser.parse()
sequences = fasta_parser.get_sequences()

kmer_counter = KmerCounter(k)

def count_kmers_for_sequence(sequence):
kmer_counter.count_kmers(sequence)

with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as␣


↪executor:
list(executor.map(count_kmers_for_sequence, sequences.values()))

return kmer_counter.get_kmer_counts()

if __name__ == "__main__":
file_path = 'sequences.fasta'
k = int(input("Enter the desired k-mer size: "))

num_threads = 2

kmer_counts = count_kmers_in_fasta(file_path, k, num_threads)

for kmer, count in kmer_counts.items():


print(f"{kmer}: {count}")

Enter the desired k-mer size: 3


GAT: 470
ATT: 748
TTT: 896
TTA: 806
TAA: 636
AAG: 520
AGT: 504

3
GTG: 568
TGA: 611
GAA: 391
AAT: 626
ATA: 486
TAG: 405
AGC: 343
GCT: 578
CTT: 741
TTG: 840
TGG: 587
GGC: 331
CTA: 564
TAT: 716
ATC: 349
TCT: 666
CTC: 423
TCA: 504
CAC: 422
ACT: 641
TTC: 584
TCC: 332
CCC: 226
CCT: 418
TCG: 191
CGT: 269
GTT: 741
TGC: 622
GCA: 453
CAG: 422
AGA: 437
AAC: 446
ACG: 200
CGA: 137
AAA: 589
GCC: 246
CTG: 575
TGT: 885
GCG: 195
GTA: 468
GTC: 337
GGT: 456
GGG: 162
GGA: 281
CAT: 493
AGG: 356
GAC: 313
ACA: 563

4
ATG: 722
CAA: 565
TAC: 566
GAG: 293
CGG: 125
CCG: 121
ACC: 343
CGC: 176
CCA: 382

Question 2

Solution
Extracting sequences from Fasta file

[25]: import pandas as pd


from Bio import SeqIO
with open('sequences2.fasta') as fasta_file:
identifiers = []
for seq_record in SeqIO.parse(fasta_file, 'fasta'): # (generator)
identifiers.append(str(seq_record.seq))

Edit distance calculation using function


[20]: def edit_distance(s, t):
m, n = len(s), len(t)
dp = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(m + 1):
dp[i][0] = i
for j in range(n + 1):
dp[0][j] = j
for i in range(1, m + 1):
for j in range(1, n + 1):
if s[i - 1] == t[j - 1]:
cost=0
else:
cost=1
dp[i][j] = min(dp[i - 1][j] + 1,dp[i][j - 1] + 1,dp[i - 1][j - 1] +␣
↪cost)

return dp[m][n]

# Sample Case
s = "PLEASANTLY"
t = "MEANLY"
print()
print("s:", identifiers[2])

5
print()
print("t:", identifiers[3])
print("Edit Distance :",edit_distance(identifiers[2], identifiers[3]))

s: DVTPVDQYMCGVDGKPISAYAFLMAKDGITKLADVEADVAARADDEGFITLKNNLYRLVWHVERKDVPYPKQSIFTI
NSVVQKDGVENTPPHYFTLGCKILTLTPRNKWSGVSDLSLKQKLLYTFYGKESLENPTYIYHSAFIECGSCGNDSWLTGN
AIQGFACGCGASYTANDVEVQSSGMIKPNALLCATCPFAKGDSCSSNCKHSVAQLVSYLSERCNVIADSKSFTLIFGGVA
YAYFGCEEGTMYFVPRAKSVVSRIGDSIFTGCTGSWNKVTQIANMFLEQTQHSLNFVGEFVVNDVVLAILSGTTTNVDKI
RQLLKGVTLDKLRDYLADYDVAVTAGPFMDNAINVGGTGLQYAAITAPYVVLTGLGESFKKVATIPYKVCNSVKDTLTYY
AHSVLYRVFPYDMDSGVSSFSELLFDCVDLSVASTYFLVRLLQDKTGDFMSTIITSCQTAVSKLLDTCFEATEATFNFLL
DLAGLFRIFLRNAYVYTSQGFVVVNGKVSTLVKQVLDLLNKGMQLLHTKVSWAGSNISAVIYSGRESLIFPSGTYYCVTT
KAKSVQQDLDVILPGEFSKKQLGLLQPTDNSTTVSVTVSSNMVETVVGQLEQTNMHSPDVIVGDYVIISEKLFVRSKEED
GFAFYPACTNGHAVPTLFRLKGG

t: APTWFNALRDFTLKGYVLATIIVFLCAVLMYLCLPTFSMVPVEFYEDRILDFKVLDNGIIRDVNPDDKCFANKHRSF
TQWYHEHVGGVYDNSITCPLTVAVIAGVAGARIPDVPTTLAWVNNQIIFFVSRVFANTGSVCYTPIDEIPYKSFSDSGCI
LPSECTMFRDAEGRMTPYCHDPTVLPGAFAYSQMRPHVRYDLYDGNMFIKFPEVVFESTLRITRTLSTQYCRFGSCEYAQ
EGVCITTNGSWAIFNDHHLNRPGVYCGSDFIDIVRRLAVSLFQPITYFQLTTSLVLGIGLCAFLTLLFYYINKVKRAFAD
YTQCAVIAVVAAVLNSLCICFVASIPLCIVPYTALYYYATFYFTNEPAFIMHVSWYIMFGPIVPIWMTCVYTVAMCFRHF
FWVLAYFSKKHVEVFTDGKLNCSFQDAASNIFVINKDTYAALRNSLTNDAYSRFLGLFNKYKYFSGAMETAAYREAAACH
LAKALQTYSETGSDLLYQPPNCSITSGVLQ
Edit Distance : 519

Question 3

Solution:
Extracting sequences from FASTA file

[ ]: import pandas as pd
from Bio import SeqIO
with open('sequences3.fasta') as fasta_file: # Will close handle cleanly
identifiers = []
#read Sequence from FASTA file
for seq_record in SeqIO.parse(fasta_file, 'fasta'): # (generator)
identifiers.append(str(seq_record.seq))

Function to compute the aligned sequence based on the data collected in trace-
back matrix
[17]: def compute_aligned_sequences(traceback,s,t):
aligned_s = ""
aligned_t = ""
i, j = len(s), len(t)

while i > 0 or j > 0:

6
# Substitution
if i > 0 and j > 0 and traceback[i][j] == "S":
aligned_s = s[i - 1] + aligned_s
aligned_t = t[j - 1] + aligned_t
i -= 1
j -= 1

# Deletion from t sequence


elif i > 0 and traceback[i][j] == "D":
aligned_s = s[i - 1] + aligned_s
aligned_t = "-" + aligned_t
i -= 1

#Insertion to t sequence
elif j > 0 and traceback[i][j] == "I":
aligned_s = "-" + aligned_s
aligned_t = t[j - 1] + aligned_t
j -= 1

return aligned_s,aligned_t

6.0.3 Computing the edit distance and forming the traceback matrix

[21]: def compute_optimal_alignment(s, t):


m, n = len(s), len(t)

# Initialize the dynamic programming matrix for computing edit distance


dp = [[0 for _ in range(n + 1)] for _ in range(m + 1)]

# Initialize traceback matrix to help find out the aligned sequences by␣
↪capturing the operations at each step

traceback = [[""] * (n + 1) for _ in range(m + 1)]

#dp[i][j] refers to computing alignment with first i elements from sequence␣


↪s and first j elements from sequence t

for i in range(1, m + 1):


dp[i][0] = i*2
traceback[i][0] = "D" # Deletion

for j in range(1, n + 1):


dp[0][j] = j*2
traceback[0][j] = "I" # Insertion

for i in range(1, m + 1):


for j in range(1, n + 1):

7
# Match corresponds to a penalty of 0, mismatch corresponds to a␣
↪penalty of 3
penalty = 0 if s[i - 1] == t[j - 1] else 3

# Gap corresponds to a penalty of 2


delete_penalty = dp[i - 1][j] + 2
insert_penalty = dp[i][j - 1] + 2

substitute_penalty = dp[i - 1][j - 1] + penalty

# Find the minimum penalty among deletion, insertion, and␣


↪substitution
min_penalty = min(delete_penalty, insert_penalty,␣
↪substitute_penalty)

# Update the traceback matrix to keep track of the optimal path


if min_penalty == delete_penalty:
traceback[i][j] = "D" # Deletion
elif min_penalty == insert_penalty:
traceback[i][j] = "I" # Insertion
else:
traceback[i][j] = "S" # Substitution

dp[i][j] = min_penalty

return dp[m][n], traceback

Printing the edit distance followed by two augmented strings s and t repre-
senting an optimal alignment of s and t.

[23]: file_path = 'sequences3.fasta'

sequence1 = identifiers[0]
sequence2 = identifiers[1]

edit_distance, traceback = compute_optimal_alignment(sequence1, sequence2)


aligned_sequence1,␣
↪aligned_sequence2=compute_aligned_sequences(traceback,sequence1,sequence2)

print("Edit Distance:", edit_distance)


print()
print("s:", sequence1)
print("s':", aligned_sequence1)
print()
print("t:", sequence2)
print("t':", aligned_sequence2)

8
Edit Distance: 358

s: MPIPPLRKMLGIGGDRTEKLIPGMELSNWLPGGTSTTLELDPKQHSHSGLLRMASFGSMKMAPLMLLQLLGRGTLTM
IQLLLHNSRPVLSFLKTSTLRGLEAIVNHLQEPLA
s': MPIPPLRKMLGIGGD-RT----EK------LI-P--GM----E-LSNWLPGGTSTT------L---E-LD--PKQ
---HS-HS-GL---L--R--------M----A-SF---GSM-------K---MA-PLML-L----Q--LL---GRGTL--
TMIQLLLHN-SRPVLS--FL---KTSTLRGLEAIVNHLQEPLA--

t: MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLSPWFMDGENAYEVVKAMLLKKEPLLYVPIRLA
GHTRHLPGPRVYLVERLIACENPFMVNQLAYSSSANGSLVGTTLQGKPIGMFFPYDIELVTGKQNILLRKYGRGGYHYTP
FHYERDNTSCPEWMDDFEADPKGKYAQNLLKKLIGG
t': MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLSPWFMDGENAYEVVKAMLLKKEPLLYVPIRL
AGHTRHLPGPRVYLVERLIACENPFMVNQLAYSSSANGSLVGTTLQGKPIGMFFPYDIELVTGKQNILLRKYGRGGYHYT
PFHYERDNTSCPEWMDDFEADPK-----GKYAQ-NLLKK-LIGG

[ ]:

You might also like