0% found this document useful (0 votes)

24 views9 pages

Group17 2

Uploaded by

sandeep

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

24 views9 pages

Group17 2

Uploaded by

sandeep

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 9

group17

September 18, 2023

1 Question 1
Solution
Finding all possible k-mers

[27]: import concurrent.futures

import logging
import sys
import math
from collections import Counter
from tqdm import tqdm

class FastaParser:
    """Minimal FASTA reader that keeps every record in memory.

    After :meth:`parse` has run, ``sequences`` maps each header line (the text
    after the leading ``>``) to the concatenation of the lines that follow it,
    and ``sequence_lengths`` maps the same header to that string's length.
    """

    def __init__(self, file_path):
        """Remember *file_path*; nothing is read until :meth:`parse` is called."""
        self.file_path = file_path
        self.sequences = {}
        # Lengths are stored next to the sequences for quick access.
        self.sequence_lengths = {}

    def _store(self, name, chunks):
        """Record one finished sequence built from *chunks* under *name*.

        Records without any sequence lines are skipped. Lines that appeared
        before the first header have no name to be stored under, so they are
        reported and dropped instead of ending up under the key ``None``.
        """
        if not chunks:
            return
        if name is None:
            logging.warning(
                "Ignoring %d sequence line(s) found before the first FASTA header",
                len(chunks),
            )
            return
        sequence_data = ''.join(chunks)
        self.sequences[name] = sequence_data
        self.sequence_lengths[name] = len(sequence_data)

    def parse(self):
        """Read the file and fill ``sequences`` and ``sequence_lengths``.

        Parsing is best-effort: any exception (for example a missing file) is
        logged and swallowed, leaving whatever was parsed so far in place.
        """
        current_sequence = []
        current_sequence_name = None

        try:
            with open(self.file_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        # Blank lines carry no residues.
                        continue
                    if line.startswith('>'):
                        # A new header closes the record collected so far.
                        self._store(current_sequence_name, current_sequence)
                        current_sequence_name = line[1:]
                        current_sequence = []
                    else:
                        current_sequence.append(line)

            # The last record has no following header to trigger its storage.
            self._store(current_sequence_name, current_sequence)

        except Exception as e:
            logging.error("Error while parsing: %s", e)

    def get_sequences(self):
        """Return the header -> sequence mapping."""
        return self.sequences

    def get_sequence_lengths(self):
        """Return the header -> sequence length mapping."""
        return self.sequence_lengths

# Define a class to count k-mers in sequences

class KmerCounter:
    """Accumulate k-mer frequencies over any number of sequences.

    ``kmer_counts`` holds the running totals and ``total_kmers`` the number of
    windows counted so far. ``count_kmers`` may be called from several threads
    (``count_kmers_in_fasta`` does so), so merging into the shared totals is
    serialised with a lock.
    """

    def __init__(self, k):
        """Create a counter for windows of length *k*.

        Raises:
            ValueError: if *k* is smaller than 1.
        """
        import threading  # local import: no other part of the file needs it

        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k
        self.kmer_counts = Counter()
        self.total_kmers = 0
        self._lock = threading.Lock()

    def count_kmers(self, sequence):
        """Add every length-``k`` window of *sequence* to the running totals.

        A sequence shorter than ``k`` contributes nothing.
        """
        k = self.k
        # Count locally first so the lock is only held for the cheap merge.
        local_counts = Counter(
            sequence[start:start + k]
            for start in range(len(sequence) - k + 1)
        )
        with self._lock:
            self.kmer_counts.update(local_counts)
            self.total_kmers += sum(local_counts.values())

    def get_kmer_counts(self):
        """Return the ``Counter`` holding the accumulated k-mer counts."""
        return self.kmer_counts

# Function to count k-mers in a FASTA file

def count_kmers_in_fasta(file_path, k, num_threads=1):
    """Count the k-mers of every sequence in a FASTA file.

    Args:
        file_path: path of the FASTA file, handed to ``FastaParser``.
        k: window length, handed to ``KmerCounter``.
        num_threads: ``max_workers`` of the thread pool that runs one
            counting task per sequence.

    Returns:
        A ``Counter`` mapping each k-mer to its total count over all sequences.
    """
    fasta_parser = FastaParser(file_path)
    fasta_parser.parse()
    sequences = fasta_parser.get_sequences()

    def count_kmers_for_sequence(sequence):
        # Every task owns its counter, so worker threads never touch shared
        # mutable state.
        per_sequence = KmerCounter(k)
        per_sequence.count_kmers(sequence)
        return per_sequence.get_kmer_counts()

    totals = Counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # map() yields results in submission order; merging here, in the
        # calling thread, keeps the totals (and their key order) deterministic.
        for counts in executor.map(count_kmers_for_sequence, sequences.values()):
            totals.update(counts)

    return totals

if __name__ == "__main__":
    # Script entry point: read k from the user, count, and print one
    # "kmer: count" line per distinct k-mer.
    file_path = 'sequences.fasta'
    k = int(input("Enter the desired k-mer size: "))

    # A window of length zero or less has no meaning; without this check the
    # run would print empty or garbled "k-mers" instead of failing clearly.
    if k < 1:
        raise ValueError("k-mer size must be a positive integer")

    num_threads = 2

    kmer_counts = count_kmers_in_fasta(file_path, k, num_threads)

    for kmer, count in kmer_counts.items():
        print(f"{kmer}: {count}")

Enter the desired k-mer size: 3

GAT: 470
ATT: 748
TTT: 896
TTA: 806
TAA: 636
AAG: 520
AGT: 504

3
GTG: 568
TGA: 611
GAA: 391
AAT: 626
ATA: 486
TAG: 405
AGC: 343
GCT: 578
CTT: 741
TTG: 840
TGG: 587
GGC: 331
CTA: 564
TAT: 716
ATC: 349
TCT: 666
CTC: 423
TCA: 504
CAC: 422
ACT: 641
TTC: 584
TCC: 332
CCC: 226
CCT: 418
TCG: 191
CGT: 269
GTT: 741
TGC: 622
GCA: 453
CAG: 422
AGA: 437
AAC: 446
ACG: 200
CGA: 137
AAA: 589
GCC: 246
CTG: 575
TGT: 885
GCG: 195
GTA: 468
GTC: 337
GGT: 456
GGG: 162
GGA: 281
CAT: 493
AGG: 356
GAC: 313
ACA: 563

4
ATG: 722
CAA: 565
TAC: 566
GAG: 293
CGG: 125
CCG: 121
ACC: 343
CGC: 176
CCA: 382

Question 2

Solution
Extracting sequences from Fasta file

[25]: import pandas as pd

from Bio import SeqIO
# Load every record of the FASTA file and keep its sequence as a plain string.
# NOTE: despite the name, `identifiers` holds the sequences, not the record IDs.
with open('sequences2.fasta') as fasta_file:
    identifiers = [
        str(record.seq) for record in SeqIO.parse(fasta_file, 'fasta')
    ]

Edit distance calculation using function

def edit_distance(s, t):
    """Return the edit (Levenshtein) distance between *s* and *t*.

    Insertions, deletions and substitutions each cost 1; matching symbols
    cost 0. Only the previous row of the dynamic-programming table is ever
    read, so a single rolling row is kept instead of the full table.
    """
    # The distance is symmetric; putting the shorter string on the column
    # axis keeps the rolling row as small as possible.
    if len(t) > len(s):
        s, t = t, s

    # Row 0: turning the empty prefix of s into t[:j] takes j insertions.
    previous = list(range(len(t) + 1))
    for i, s_char in enumerate(s, start=1):
        # Column 0: turning s[:i] into the empty string takes i deletions.
        current = [i]
        for j, t_char in enumerate(t, start=1):
            cost = 0 if s_char == t_char else 1
            current.append(min(
                previous[j] + 1,         # deletion
                current[j - 1] + 1,      # insertion
                previous[j - 1] + cost,  # match / substitution
            ))
        previous = current

    return previous[-1]

# Sample case: s and t are defined here, but the comparison that is printed
# uses the third and fourth sequences read from the FASTA file.
s = "PLEASANTLY"
t = "MEANLY"
# The leading "\n" emits the blank separator line before each sequence.
print("\ns:", identifiers[2])
print("\nt:", identifiers[3])
print("Edit Distance :", edit_distance(identifiers[2], identifiers[3]))

s: DVTPVDQYMCGVDGKPISAYAFLMAKDGITKLADVEADVAARADDEGFITLKNNLYRLVWHVERKDVPYPKQSIFTI
NSVVQKDGVENTPPHYFTLGCKILTLTPRNKWSGVSDLSLKQKLLYTFYGKESLENPTYIYHSAFIECGSCGNDSWLTGN
AIQGFACGCGASYTANDVEVQSSGMIKPNALLCATCPFAKGDSCSSNCKHSVAQLVSYLSERCNVIADSKSFTLIFGGVA
YAYFGCEEGTMYFVPRAKSVVSRIGDSIFTGCTGSWNKVTQIANMFLEQTQHSLNFVGEFVVNDVVLAILSGTTTNVDKI
RQLLKGVTLDKLRDYLADYDVAVTAGPFMDNAINVGGTGLQYAAITAPYVVLTGLGESFKKVATIPYKVCNSVKDTLTYY
AHSVLYRVFPYDMDSGVSSFSELLFDCVDLSVASTYFLVRLLQDKTGDFMSTIITSCQTAVSKLLDTCFEATEATFNFLL
DLAGLFRIFLRNAYVYTSQGFVVVNGKVSTLVKQVLDLLNKGMQLLHTKVSWAGSNISAVIYSGRESLIFPSGTYYCVTT
KAKSVQQDLDVILPGEFSKKQLGLLQPTDNSTTVSVTVSSNMVETVVGQLEQTNMHSPDVIVGDYVIISEKLFVRSKEED
GFAFYPACTNGHAVPTLFRLKGG

t: APTWFNALRDFTLKGYVLATIIVFLCAVLMYLCLPTFSMVPVEFYEDRILDFKVLDNGIIRDVNPDDKCFANKHRSF
TQWYHEHVGGVYDNSITCPLTVAVIAGVAGARIPDVPTTLAWVNNQIIFFVSRVFANTGSVCYTPIDEIPYKSFSDSGCI
LPSECTMFRDAEGRMTPYCHDPTVLPGAFAYSQMRPHVRYDLYDGNMFIKFPEVVFESTLRITRTLSTQYCRFGSCEYAQ
EGVCITTNGSWAIFNDHHLNRPGVYCGSDFIDIVRRLAVSLFQPITYFQLTTSLVLGIGLCAFLTLLFYYINKVKRAFAD
YTQCAVIAVVAAVLNSLCICFVASIPLCIVPYTALYYYATFYFTNEPAFIMHVSWYIMFGPIVPIWMTCVYTVAMCFRHF
FWVLAYFSKKHVEVFTDGKLNCSFQDAASNIFVINKDTYAALRNSLTNDAYSRFLGLFNKYKYFSGAMETAAYREAAACH
LAKALQTYSETGSDLLYQPPNCSITSGVLQ
Edit Distance : 519

Question 3

Solution:
Extracting sequences from FASTA file

[ ]: import pandas as pd
from Bio import SeqIO
# Read each record's sequence from the FASTA file as a plain string; the
# `with` block closes the file handle cleanly.
# NOTE: despite the name, `identifiers` holds the sequences, not the record IDs.
with open('sequences3.fasta') as fasta_file:
    identifiers = [
        str(record.seq) for record in SeqIO.parse(fasta_file, 'fasta')
    ]

Function to compute the aligned sequences from the data collected in the
traceback matrix
def compute_aligned_sequences(traceback, s, t):
    """Rebuild the two aligned strings by walking a traceback matrix.

    Args:
        traceback: matrix indexed as ``traceback[i][j]`` for ``0 <= i <= len(s)``
            and ``0 <= j <= len(t)``. Each visited cell holds ``"S"`` (consume
            one symbol from both strings), ``"D"`` (consume from *s*, gap in
            *t*) or ``"I"`` (consume from *t*, gap in *s*).
        s: first sequence.
        t: second sequence.

    Returns:
        ``(aligned_s, aligned_t)``, with ``"-"`` marking gaps.

    Raises:
        ValueError: if a visited cell holds no move that is valid at that
            position; without this check the walk would never terminate.
    """
    # Columns are collected back-to-front and reversed once at the end, which
    # avoids rebuilding both strings on every step.
    s_columns = []
    t_columns = []
    i, j = len(s), len(t)

    while i > 0 or j > 0:
        move = traceback[i][j]

        if move == "S" and i > 0 and j > 0:
            # Substitution (or match): one symbol from each sequence.
            s_columns.append(s[i - 1])
            t_columns.append(t[j - 1])
            i -= 1
            j -= 1
        elif move == "D" and i > 0:
            # Deletion from the t sequence: s advances against a gap.
            s_columns.append(s[i - 1])
            t_columns.append("-")
            i -= 1
        elif move == "I" and j > 0:
            # Insertion into the t sequence: t advances against a gap.
            s_columns.append("-")
            t_columns.append(t[j - 1])
            j -= 1
        else:
            raise ValueError(
                f"invalid traceback entry {move!r} at position ({i}, {j})"
            )

    return "".join(reversed(s_columns)), "".join(reversed(t_columns))

6.0.3 Computing the edit distance and forming the traceback matrix

def compute_optimal_alignment(s, t, *, gap_penalty=2, mismatch_penalty=3,
                              match_penalty=0):
    """Compute the minimum-penalty global alignment score of *s* and *t*.

    Args:
        s: first sequence.
        t: second sequence.
        gap_penalty: cost of aligning a symbol against a gap (default 2).
        mismatch_penalty: cost of aligning two different symbols (default 3).
        match_penalty: cost of aligning two equal symbols (default 0).

    Returns:
        ``(score, traceback)``. ``score`` is the total penalty of an optimal
        alignment. ``traceback[i][j]`` records the move chosen for the first
        ``i`` symbols of *s* against the first ``j`` symbols of *t*: ``"D"``
        (deletion), ``"I"`` (insertion) or ``"S"`` (substitution/match);
        ``traceback[0][0]`` stays ``""``.
    """
    m, n = len(s), len(t)

    # dp[i][j] is the best penalty for aligning s[:i] with t[:j].
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    # traceback[i][j] remembers which move produced dp[i][j].
    traceback = [[""] * (n + 1) for _ in range(m + 1)]

    # Borders: a prefix aligned against the empty string is all gaps.
    for i in range(1, m + 1):
        dp[i][0] = i * gap_penalty
        traceback[i][0] = "D"  # Deletion
    for j in range(1, n + 1):
        dp[0][j] = j * gap_penalty
        traceback[0][j] = "I"  # Insertion

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            penalty = match_penalty if s[i - 1] == t[j - 1] else mismatch_penalty

            delete_penalty = dp[i - 1][j] + gap_penalty
            insert_penalty = dp[i][j - 1] + gap_penalty
            substitute_penalty = dp[i - 1][j - 1] + penalty

            min_penalty = min(delete_penalty, insert_penalty, substitute_penalty)

            # Ties are resolved in the order deletion, insertion, substitution.
            if min_penalty == delete_penalty:
                traceback[i][j] = "D"  # Deletion
            elif min_penalty == insert_penalty:
                traceback[i][j] = "I"  # Insertion
            else:
                traceback[i][j] = "S"  # Substitution

            dp[i][j] = min_penalty

    return dp[m][n], traceback

Printing the edit distance followed by two augmented strings s' and t'
representing an optimal alignment of s and t.

# Align the first two sequences read from the FASTA file and print the score
# followed by each original sequence and its gapped counterpart.
file_path = 'sequences3.fasta'

sequence1 = identifiers[0]
sequence2 = identifiers[1]

# The score is kept under its own name: binding it to `edit_distance` would
# overwrite the edit_distance() function defined earlier in the notebook.
alignment_score, traceback = compute_optimal_alignment(sequence1, sequence2)

aligned_sequence1, aligned_sequence2 = compute_aligned_sequences(
    traceback, sequence1, sequence2
)

print("Edit Distance:", alignment_score)

print()
print("s:", sequence1)
print("s':", aligned_sequence1)
print()
print("t:", sequence2)
print("t':", aligned_sequence2)

8
Edit Distance: 358

s: MPIPPLRKMLGIGGDRTEKLIPGMELSNWLPGGTSTTLELDPKQHSHSGLLRMASFGSMKMAPLMLLQLLGRGTLTM
IQLLLHNSRPVLSFLKTSTLRGLEAIVNHLQEPLA
s': MPIPPLRKMLGIGGD-RT----EK------LI-P--GM----E-LSNWLPGGTSTT------L---E-LD--PKQ
---HS-HS-GL---L--R--------M----A-SF---GSM-------K---MA-PLML-L----Q--LL---GRGTL--
TMIQLLLHN-SRPVLS--FL---KTSTLRGLEAIVNHLQEPLA--

t: MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLSPWFMDGENAYEVVKAMLLKKEPLLYVPIRLA
GHTRHLPGPRVYLVERLIACENPFMVNQLAYSSSANGSLVGTTLQGKPIGMFFPYDIELVTGKQNILLRKYGRGGYHYTP
FHYERDNTSCPEWMDDFEADPKGKYAQNLLKKLIGG
t': MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLSPWFMDGENAYEVVKAMLLKKEPLLYVPIRL
AGHTRHLPGPRVYLVERLIACENPFMVNQLAYSSSANGSLVGTTLQGKPIGMFFPYDIELVTGKQNILLRKYGRGGYHYT
PFHYERDNTSCPEWMDDFEADPK-----GKYAQ-NLLKK-LIGG

[ ]:

Graph Algorithms: BFS, DFS, and Applications
No ratings yet
Graph Algorithms: BFS, DFS, and Applications
8 pages
Ai SRK
No ratings yet
Ai SRK
19 pages
AI - Programs KP Print
No ratings yet
AI - Programs KP Print
14 pages
Ai Lab - R22
No ratings yet
Ai Lab - R22
20 pages
13 Object Oriented Programming - Python Solutions 1.5 Documentation
No ratings yet
13 Object Oriented Programming - Python Solutions 1.5 Documentation
5 pages
FINALailabfile
No ratings yet
FINALailabfile
26 pages
AI and ML Lab Program
No ratings yet
AI and ML Lab Program
24 pages
HPC Practical 2025
No ratings yet
HPC Practical 2025
19 pages
AI Lab File Prince
No ratings yet
AI Lab File Prince
12 pages
Artificial Intelligencefile
No ratings yet
Artificial Intelligencefile
42 pages
AI Lab Prog-1,2,3
No ratings yet
AI Lab Prog-1,2,3
10 pages
AIML Manual - Merged
No ratings yet
AIML Manual - Merged
41 pages
Aiml Lab Manual
No ratings yet
Aiml Lab Manual
44 pages
AI&ML
No ratings yet
AI&ML
38 pages
AI Practicals
No ratings yet
AI Practicals
47 pages
Ai Code
No ratings yet
Ai Code
21 pages
Artificial Intelligence Lab File
No ratings yet
Artificial Intelligence Lab File
10 pages
Ada Lab File
No ratings yet
Ada Lab File
25 pages
Python Functions and Oop
No ratings yet
Python Functions and Oop
7 pages
Aiml Lab
No ratings yet
Aiml Lab
52 pages
bfaceb86-a886-404c-9378-3a8fb918889a
No ratings yet
bfaceb86-a886-404c-9378-3a8fb918889a
10 pages
Cs3491 - Aiml Lab Record
No ratings yet
Cs3491 - Aiml Lab Record
26 pages
Aiml Programs
No ratings yet
Aiml Programs
24 pages
AAI Journal
No ratings yet
AAI Journal
17 pages
AI Lets Go
No ratings yet
AI Lets Go
28 pages
CS3491 - AIML Lab Record
No ratings yet
CS3491 - AIML Lab Record
79 pages
Design and Analysis of Algorithm Lab Manual - Answers
No ratings yet
Design and Analysis of Algorithm Lab Manual - Answers
13 pages
Ai Outputs
No ratings yet
Ai Outputs
12 pages
Artificial Intelligence Lab File
No ratings yet
Artificial Intelligence Lab File
16 pages
AIML Lab
No ratings yet
AIML Lab
42 pages
Output
No ratings yet
Output
65 pages
Function Solutions
No ratings yet
Function Solutions
10 pages
257 TYB CP Pract 6
No ratings yet
257 TYB CP Pract 6
11 pages
Al Codes
No ratings yet
Al Codes
26 pages
Aa Lab Manual
No ratings yet
Aa Lab Manual
10 pages
Daa Record 6 To 9
No ratings yet
Daa Record 6 To 9
28 pages
Final AI LAB FILE
No ratings yet
Final AI LAB FILE
20 pages
ADA LAB Manual-1
No ratings yet
ADA LAB Manual-1
42 pages
Aiml Lab Exps
No ratings yet
Aiml Lab Exps
16 pages
Ai Lab Programs
No ratings yet
Ai Lab Programs
12 pages
AI and ML Manual
No ratings yet
AI and ML Manual
11 pages
AI File
No ratings yet
AI File
26 pages
P03 A Star Algorithm 35 Anushka Shetty
No ratings yet
P03 A Star Algorithm 35 Anushka Shetty
23 pages
AI Practical Assignments
No ratings yet
AI Practical Assignments
12 pages
AD 304 Artificial Intelligence Lab Manual
No ratings yet
AD 304 Artificial Intelligence Lab Manual
33 pages
Programs For AI
No ratings yet
Programs For AI
33 pages
Infytq
No ratings yet
Infytq
34 pages
A044 AI Pract Prathamesh
No ratings yet
A044 AI Pract Prathamesh
23 pages
Artificial Intelligence Lab
No ratings yet
Artificial Intelligence Lab
11 pages
ADA Programs
No ratings yet
ADA Programs
5 pages
AI Lab File Vivek Pandey
No ratings yet
AI Lab File Vivek Pandey
26 pages
AILABCODES
No ratings yet
AILABCODES
8 pages
AIML Final Programs
No ratings yet
AIML Final Programs
8 pages
Certificate: MSG-SGKM College Arts, Science and Commerce
No ratings yet
Certificate: MSG-SGKM College Arts, Science and Commerce
34 pages
Notes
No ratings yet
Notes
48 pages
Lab Manual AI Lab VI Sem
No ratings yet
Lab Manual AI Lab VI Sem
34 pages
Ai 3
No ratings yet
Ai 3
8 pages
New Compiler Practls
No ratings yet
New Compiler Practls
17 pages
IT-5203 - Flow Control PDF
No ratings yet
IT-5203 - Flow Control PDF
32 pages
Model SXT 5 SXT Lite5: 24V 0.38A Adapter Mounting Ring Pole Mounting Bracket Poe Injector 24V 0.8A Adapter
No ratings yet
Model SXT 5 SXT Lite5: 24V 0.38A Adapter Mounting Ring Pole Mounting Bracket Poe Injector 24V 0.8A Adapter
1 page
Lifebuoy Soap: Present By: Bheem Soothar 1539105 Naveed Iftikhar 1539117
No ratings yet
Lifebuoy Soap: Present By: Bheem Soothar 1539105 Naveed Iftikhar 1539117
5 pages
Bulk Material Density Chart: Product Type Product Type
No ratings yet
Bulk Material Density Chart: Product Type Product Type
4 pages
Class XI - Maths Assignment - Basic Maths
No ratings yet
Class XI - Maths Assignment - Basic Maths
4 pages
Cloud Answers
No ratings yet
Cloud Answers
29 pages
Molecular Biotechnology 6th Edition Bernard R. Glick Online PDF
No ratings yet
Molecular Biotechnology 6th Edition Bernard R. Glick Online PDF
119 pages
The Great Gatsby: Jazz Age Critique
No ratings yet
The Great Gatsby: Jazz Age Critique
13 pages
Tamil Fish
No ratings yet
Tamil Fish
14 pages
Investigators Directory - CW - 2.0
No ratings yet
Investigators Directory - CW - 2.0
202 pages
Anodizing Aluminum: ECP Lab Report
100% (1)
Anodizing Aluminum: ECP Lab Report
15 pages
Request For Proposal17
No ratings yet
Request For Proposal17
103 pages
WALKER, L. L. Language of Amos
No ratings yet
WALKER, L. L. Language of Amos
13 pages
28 Day Shred Day05
No ratings yet
28 Day Shred Day05
2 pages
200 Word Stress
No ratings yet
200 Word Stress
6 pages
Doctor Job Application Letter Sample
No ratings yet
Doctor Job Application Letter Sample
20 pages
Xerostomia: Dental Student Guide
No ratings yet
Xerostomia: Dental Student Guide
16 pages
Grade 5 Science: Plant Reproduction
100% (1)
Grade 5 Science: Plant Reproduction
13 pages
Answer
No ratings yet
Answer
1 page
EE PPT
No ratings yet
EE PPT
63 pages
CH 014
No ratings yet
CH 014
25 pages
Research Methods
No ratings yet
Research Methods
2 pages
Philippines-Japan Local Administration Seminar 2015
No ratings yet
Philippines-Japan Local Administration Seminar 2015
62 pages
Byte Filling Function Guide
No ratings yet
Byte Filling Function Guide
3 pages
Āryabha A Ganit Challenge 2022: Maximum Marks: 40 Duration: 1 Hour
100% (2)
Āryabha A Ganit Challenge 2022: Maximum Marks: 40 Duration: 1 Hour
9 pages
Day 3 Health - q1 - Health
No ratings yet
Day 3 Health - q1 - Health
4 pages
Question Bank U3& U4-1
No ratings yet
Question Bank U3& U4-1
18 pages
KV38,50 Qual
No ratings yet
KV38,50 Qual
103 pages
Distal Tubule Balance and Tubuloglomerular Feedback-Group 2
No ratings yet
Distal Tubule Balance and Tubuloglomerular Feedback-Group 2
42 pages
CCNA 4 Exam Answers & Networking Tips
No ratings yet
CCNA 4 Exam Answers & Networking Tips
28 pages

Group17 2

Uploaded by

Group17 2

Uploaded by

group17

September 18, 2023

[27]: import concurrent.futures

# Store the last sequence encountered

# Define a class to count k-mers in sequences

def count_kmers(self, sequence):

# Initialize a Counter to track k-mer counts

# Create a sliding window of size k

# Increment the count of the k-mer in the Counter

# Update the total_kmers count based on the local kmer_counter

# Function to count k-mers in a FASTA file

with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as␣

kmer_counts = count_kmers_in_fasta(file_path, k, num_threads)

for kmer, count in kmer_counts.items():

Enter the desired k-mer size: 3

[25]: import pandas as pd

Edit distance calculation using function

while i > 0 or j > 0:

# Deletion from t sequence

[21]: def compute_optimal_alignment(s, t):

# Initialize the dynamic programming matrix for computing edit distance

traceback = [[""] * (n + 1) for _ in range(m + 1)]

#dp[i][j] refers to computing alignment with first i elements from sequence␣

for i in range(1, m + 1):

for j in range(1, n + 1):

for i in range(1, m + 1):

# Gap corresponds to a penalty of 2

substitute_penalty = dp[i - 1][j - 1] + penalty

# Find the minimum penalty among deletion, insertion, and␣

# Update the traceback matrix to keep track of the optimal path

return dp[m][n], traceback

[23]: file_path = 'sequences3.fasta'

edit_distance, traceback = compute_optimal_alignment(sequence1, sequence2)

print("Edit Distance:", edit_distance)

You might also like