group17
September 18, 2023
     1    Question 1
     Solution
     Finding all possible k-mers
[27]: import concurrent.futures
      import logging
      import sys
      import math
      from collections import Counter
      from tqdm import tqdm
      class FastaParser:
          def __init__(self, file_path):
              self.file_path = file_path
              self.sequences = {}
              self.sequence_lengths = {}    # Store lengths of sequences for quick␣
       ↪access
           def parse(self):
               current_sequence = []
               current_sequence_name = None
                try:
                    with open(self.file_path, 'r') as file:
                        for line in file:
                            line = line.strip()
                            if line.startswith('>'):
                                # If a new sequence header is encountered, store the␣
         ↪previous sequence
                                if current_sequence:
                                    sequence_data = ''.join(current_sequence)
                                    self.sequences[current_sequence_name] =␣
         ↪sequence_data
                                    self.sequence_lengths[current_sequence_name] =␣
         ↪len(sequence_data)
                                                1
                         # Set the current sequence name and reset the current␣
 ↪sequence
                        current_sequence_name = line[1:]
                        current_sequence = []
                    else:
                        # Append the current line to the current sequence
                        current_sequence.append(line)
                # Store the last sequence encountered
                if current_sequence:
                    sequence_data = ''.join(current_sequence)
                    self.sequences[current_sequence_name] = sequence_data
                    self.sequence_lengths[current_sequence_name] =␣
 ↪len(sequence_data)
        except Exception as e:
            logging.error(f"Error while parsing: {str(e)}")
     def get_sequences(self):
         return self.sequences
     def get_sequence_lengths(self):
         return self.sequence_lengths
# Define a class to count k-mers in sequences
class KmerCounter:
    def __init__(self, k):
        self.k = k
        self.kmer_counts = Counter() # Use Counter for efficient counting
        self.total_kmers = 0
     def count_kmers(self, sequence):
         sequence_length = len(sequence)
        # Initialize a Counter to track k-mer counts
        kmer_counter = Counter()
        # Create a sliding window of size k
        for i in range(sequence_length - self.k + 1):
            kmer = sequence[i:i + self.k]
             # Increment the count of the k-mer in the Counter
             kmer_counter.update([kmer])
                                           2
         # Merge the counts from the local Counter into the main kmer_counts␣
  ↪dictionary
         self.kmer_counts.update(kmer_counter)
             # Update the total_kmers count based on the local kmer_counter
             self.total_kmers += sum(kmer_counter.values())
       def get_kmer_counts(self):
           return self.kmer_counts
# Function to count k-mers in a FASTA file
def count_kmers_in_fasta(file_path, k, num_threads=1):
    fasta_parser = FastaParser(file_path)
    fasta_parser.parse()
    sequences = fasta_parser.get_sequences()
       kmer_counter = KmerCounter(k)
       def count_kmers_for_sequence(sequence):
           kmer_counter.count_kmers(sequence)
     with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as␣
  ↪executor:
         list(executor.map(count_kmers_for_sequence, sequences.values()))
       return kmer_counter.get_kmer_counts()
if __name__ == "__main__":
    file_path = 'sequences.fasta'
    k = int(input("Enter the desired k-mer size: "))
       num_threads = 2
       kmer_counts = count_kmers_in_fasta(file_path, k, num_threads)
       for kmer, count in kmer_counts.items():
           print(f"{kmer}: {count}")
Enter the desired k-mer size:      3
GAT:   470
ATT:   748
TTT:   896
TTA:   806
TAA:   636
AAG:   520
AGT:   504
                                            3
GTG:   568
TGA:   611
GAA:   391
AAT:   626
ATA:   486
TAG:   405
AGC:   343
GCT:   578
CTT:   741
TTG:   840
TGG:   587
GGC:   331
CTA:   564
TAT:   716
ATC:   349
TCT:   666
CTC:   423
TCA:   504
CAC:   422
ACT:   641
TTC:   584
TCC:   332
CCC:   226
CCT:   418
TCG:   191
CGT:   269
GTT:   741
TGC:   622
GCA:   453
CAG:   422
AGA:   437
AAC:   446
ACG:   200
CGA:   137
AAA:   589
GCC:   246
CTG:   575
TGT:   885
GCG:   195
GTA:   468
GTC:   337
GGT:   456
GGG:   162
GGA:   281
CAT:   493
AGG:   356
GAC:   313
ACA:   563
             4
     ATG:   722
     CAA:   565
     TAC:   566
     GAG:   293
     CGG:   125
     CCG:   121
     ACC:   343
     CGC:   176
     CCA:   382
     Question 2
     Solution
     Extracting sequences from Fasta file
[25]: import pandas as pd
      from Bio import SeqIO
      with open('sequences2.fasta') as fasta_file:
          identifiers = []
          for seq_record in SeqIO.parse(fasta_file, 'fasta'):   # (generator)
              identifiers.append(str(seq_record.seq))
     Edit distance calculation using function
[20]: def edit_distance(s, t):
          m, n = len(s), len(t)
          dp = [[0] * (n + 1) for _ in range(m + 1)]
          for i in range(m + 1):
              dp[i][0] = i
          for j in range(n + 1):
              dp[0][j] = j
          for i in range(1, m + 1):
              for j in range(1, n + 1):
                  if s[i - 1] == t[j - 1]:
                      cost=0
                  else:
                      cost=1
                  dp[i][j] = min(dp[i - 1][j] + 1,dp[i][j - 1] + 1,dp[i - 1][j - 1] +␣
       ↪cost)
          return dp[m][n]
      # Sample Case
      s = "PLEASANTLY"
      t = "MEANLY"
      print()
      print("s:", identifiers[2])
                                                5
      print()
      print("t:", identifiers[3])
      print("Edit Distance :",edit_distance(identifiers[2], identifiers[3]))
     s: DVTPVDQYMCGVDGKPISAYAFLMAKDGITKLADVEADVAARADDEGFITLKNNLYRLVWHVERKDVPYPKQSIFTI
     NSVVQKDGVENTPPHYFTLGCKILTLTPRNKWSGVSDLSLKQKLLYTFYGKESLENPTYIYHSAFIECGSCGNDSWLTGN
     AIQGFACGCGASYTANDVEVQSSGMIKPNALLCATCPFAKGDSCSSNCKHSVAQLVSYLSERCNVIADSKSFTLIFGGVA
     YAYFGCEEGTMYFVPRAKSVVSRIGDSIFTGCTGSWNKVTQIANMFLEQTQHSLNFVGEFVVNDVVLAILSGTTTNVDKI
     RQLLKGVTLDKLRDYLADYDVAVTAGPFMDNAINVGGTGLQYAAITAPYVVLTGLGESFKKVATIPYKVCNSVKDTLTYY
     AHSVLYRVFPYDMDSGVSSFSELLFDCVDLSVASTYFLVRLLQDKTGDFMSTIITSCQTAVSKLLDTCFEATEATFNFLL
     DLAGLFRIFLRNAYVYTSQGFVVVNGKVSTLVKQVLDLLNKGMQLLHTKVSWAGSNISAVIYSGRESLIFPSGTYYCVTT
     KAKSVQQDLDVILPGEFSKKQLGLLQPTDNSTTVSVTVSSNMVETVVGQLEQTNMHSPDVIVGDYVIISEKLFVRSKEED
     GFAFYPACTNGHAVPTLFRLKGG
     t: APTWFNALRDFTLKGYVLATIIVFLCAVLMYLCLPTFSMVPVEFYEDRILDFKVLDNGIIRDVNPDDKCFANKHRSF
     TQWYHEHVGGVYDNSITCPLTVAVIAGVAGARIPDVPTTLAWVNNQIIFFVSRVFANTGSVCYTPIDEIPYKSFSDSGCI
     LPSECTMFRDAEGRMTPYCHDPTVLPGAFAYSQMRPHVRYDLYDGNMFIKFPEVVFESTLRITRTLSTQYCRFGSCEYAQ
     EGVCITTNGSWAIFNDHHLNRPGVYCGSDFIDIVRRLAVSLFQPITYFQLTTSLVLGIGLCAFLTLLFYYINKVKRAFAD
     YTQCAVIAVVAAVLNSLCICFVASIPLCIVPYTALYYYATFYFTNEPAFIMHVSWYIMFGPIVPIWMTCVYTVAMCFRHF
     FWVLAYFSKKHVEVFTDGKLNCSFQDAASNIFVINKDTYAALRNSLTNDAYSRFLGLFNKYKYFSGAMETAAYREAAACH
     LAKALQTYSETGSDLLYQPPNCSITSGVLQ
     Edit Distance : 519
     Question 3
     Solution:
     Extracting sequences from FASTA file
[ ]: import pandas as pd
     from Bio import SeqIO
     with open('sequences3.fasta') as fasta_file: # Will close handle cleanly
         identifiers = []
     #read Sequence from FASTA file
         for seq_record in SeqIO.parse(fasta_file, 'fasta'): # (generator)
             identifiers.append(str(seq_record.seq))
     Function to compute the aligned sequence based on the data collected in trace-
           back matrix
[17]: def compute_aligned_sequences(traceback,s,t):
          aligned_s = ""
          aligned_t = ""
          i, j = len(s), len(t)
          while i > 0 or j > 0:
                                               6
              # Substitution
              if i > 0 and j > 0 and traceback[i][j] == "S":
                  aligned_s = s[i - 1] + aligned_s
                  aligned_t = t[j - 1] + aligned_t
                  i -= 1
                  j -= 1
              # Deletion from t sequence
              elif i > 0 and traceback[i][j] == "D":
                  aligned_s = s[i - 1] + aligned_s
                  aligned_t = "-" + aligned_t
                  i -= 1
              #Insertion to t sequence
              elif j > 0 and traceback[i][j] == "I":
                  aligned_s = "-" + aligned_s
                  aligned_t = t[j - 1] + aligned_t
                  j -= 1
          return aligned_s,aligned_t
     6.0.3 Computing the edit distance and forming the traceback matrix
[21]: def compute_optimal_alignment(s, t):
          m, n = len(s), len(t)
          # Initialize the dynamic programming matrix for computing edit distance
          dp = [[0 for _ in range(n + 1)] for _ in range(m + 1)]
          # Initialize traceback matrix to help find out the aligned sequences by␣
       ↪capturing the operations at each step
          traceback = [[""] * (n + 1) for _ in range(m + 1)]
          #dp[i][j] refers to computing alignment with first i elements from sequence␣
       ↪s and first j elements from sequence t
          for i in range(1, m + 1):
              dp[i][0] = i*2
              traceback[i][0] = "D" # Deletion
          for j in range(1, n + 1):
              dp[0][j] = j*2
              traceback[0][j] = "I"    # Insertion
          for i in range(1, m + 1):
              for j in range(1, n + 1):
                                               7
                  # Match corresponds to a penalty of 0, mismatch corresponds to a␣
       ↪penalty of 3
                  penalty = 0 if s[i - 1] == t[j - 1] else 3
                  # Gap corresponds to a penalty of 2
                  delete_penalty = dp[i - 1][j] + 2
                  insert_penalty = dp[i][j - 1] + 2
                  substitute_penalty = dp[i - 1][j - 1] + penalty
                  # Find the minimum penalty among deletion, insertion, and␣
       ↪substitution
                  min_penalty = min(delete_penalty, insert_penalty,␣
       ↪substitute_penalty)
                  # Update the traceback matrix to keep track of the optimal path
                  if min_penalty == delete_penalty:
                      traceback[i][j] = "D" # Deletion
                  elif min_penalty == insert_penalty:
                      traceback[i][j] = "I" # Insertion
                  else:
                      traceback[i][j] = "S" # Substitution
                  dp[i][j] = min_penalty
           return dp[m][n], traceback
     Printing the edit distance followed by two augmented strings s and t repre-
            senting an optimal alignment of s and t.
[23]: file_path = 'sequences3.fasta'
      sequence1 = identifiers[0]
      sequence2 = identifiers[1]
      edit_distance, traceback = compute_optimal_alignment(sequence1, sequence2)
      aligned_sequence1,␣
       ↪aligned_sequence2=compute_aligned_sequences(traceback,sequence1,sequence2)
      print("Edit Distance:", edit_distance)
      print()
      print("s:", sequence1)
      print("s':", aligned_sequence1)
      print()
      print("t:", sequence2)
      print("t':", aligned_sequence2)
                                               8
       Edit Distance: 358
       s: MPIPPLRKMLGIGGDRTEKLIPGMELSNWLPGGTSTTLELDPKQHSHSGLLRMASFGSMKMAPLMLLQLLGRGTLTM
       IQLLLHNSRPVLSFLKTSTLRGLEAIVNHLQEPLA
       s': MPIPPLRKMLGIGGD-RT----EK------LI-P--GM----E-LSNWLPGGTSTT------L---E-LD--PKQ
       ---HS-HS-GL---L--R--------M----A-SF---GSM-------K---MA-PLML-L----Q--LL---GRGTL--
       TMIQLLLHN-SRPVLS--FL---KTSTLRGLEAIVNHLQEPLA--
       t: MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLSPWFMDGENAYEVVKAMLLKKEPLLYVPIRLA
       GHTRHLPGPRVYLVERLIACENPFMVNQLAYSSSANGSLVGTTLQGKPIGMFFPYDIELVTGKQNILLRKYGRGGYHYTP
       FHYERDNTSCPEWMDDFEADPKGKYAQNLLKKLIGG
       t': MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLSPWFMDGENAYEVVKAMLLKKEPLLYVPIRL
       AGHTRHLPGPRVYLVERLIACENPFMVNQLAYSSSANGSLVGTTLQGKPIGMFFPYDIELVTGKQNILLRKYGRGGYHYT
       PFHYERDNTSCPEWMDDFEADPK-----GKYAQ-NLLKK-LIGG
[ ]: