function_solutions
October 18, 2019
1     Table of Contents
1 Functions
   1.1 Exercise 1
   1.2 Exercise 2
   1.3 Exercise 3
   1.4 Exercise 4
   1.5 Exercise 5
   1.6 Exercise 6
   1.7 Exercise 7
   1.8 Exercise 8
   1.9 Exercise 9
   1.10 Exercise 10
   1.11 Exercise 11
   1.12 Exercise 12
   1.13 Exercise 13
   1.14 Exercise 14
   2 Case study
   2.1 Exercise 15
   2.2 Exercise 16
   2.3 Exercise 17
   2.4 Exercise 18
   2.5 Exercise 19
   2.6 Exercise 20
   2.7 Exercise 21
   2.8 A less readable way to write it
2     Functions
2.1   Exercise 1
In [3]: def print_line():
            print("This is a function!")
         print_line()
This is a function!
                                                1
2.2   Exercise 2
In [4]: def print_line():
            return "This is a function!"
         result = print_line()
         print(result)
         print(result[0])
This is a function!
T
2.3   Exercise 3
In [5]: def greet(name1, name2):
            return "Hello {} and {}!".format(name1, name2)
         print(greet("Björn", "Dag"))
Hello Björn and Dag!
2.4   Exercise 4
In [6]: def greet(name1="?", name2="?"):
            return "Hello {} and {}!".format(name1, name2)
         print(greet())
         print(greet("Petr"))
         print(greet("Björn", "Dag"))
Hello ? and ?!
Hello Petr and ?!
Hello Björn and Dag!
2.5   Exercise 5
In [7]: def multiply(nbr1, nbr2):
            return nbr1 * nbr2
         def add(nbr1, nbr2):
             return nbr1 + nbr2
         print(multiply(2,3))
         print(add(2,3))
6
5
                                           2
2.6   Exercise 6
In [8]: def multiply(nbr1, nbr2):
            return nbr1 * nbr2
         def add(nbr1, nbr2):
             return nbr1 + nbr2
         def calculate(nbr1, nbr2, operation):
             if operation == "add":
                 return add(nbr1, nbr2)
             elif operation == "multiply":
                 return multiply(nbr1, nbr2)
             else:
                 raise Exception("Unknown operation option: {}".format(operation))
         print(calculate(2, 3, operation="add"))
         print(calculate(2, 3, operation="multiply"))
5
6
2.7   Exercise 7
In [9]: def get_gc(seq_raw):
            seq = seq_raw.upper() # Handle both lower- and upper- case
            gc_count = seq.count("C") + seq.count("G")
            at_count = seq.count("A") + seq.count("T")
            return gc_count / (at_count + gc_count)
         print(get_gc("CACAGGTT"))
         print(get_gc("CAG"))
0.5
0.6666666666666666
2.8   Exercise 8
In [10]: def many_hi(number):
             for _ in range(number):
                 print("Hi!")
          many_hi(4)
Hi!
Hi!
Hi!
                                         3
Hi!
2.9    Exercise 9
In [11]: def many_hi(number):
             return ["Hi!"] * number
           hi_list = many_hi(6)
           print(hi_list)
['Hi!', 'Hi!', 'Hi!', 'Hi!', 'Hi!', 'Hi!']
2.10    Exercise 10
In [12]: def many_hi(number, word="Hi"):
             return ["{}!".format(word)] * number
           hi_list = many_hi(3)
           print(hi_list)
           word_list = many_hi(4, word="Halloa")
           print(word_list)
['Hi!', 'Hi!', 'Hi!']
['Halloa!', 'Halloa!', 'Halloa!', 'Halloa!']
2.11    Exercise 11
In [13]: seq_dict = {
             "header1.1": "ATGCTAGCTAGCTAGCTACG",
             "header1.2": "ACGTAGCTAGCTAGCAC",
             "header2.1": "AGCTAGCTAGCTATTATCTACT"
         }
         seq_dict
Out[13]: {'header1.1': 'ATGCTAGCTAGCTAGCTACG',
          'header1.2': 'ACGTAGCTAGCTAGCAC',
          'header2.1': 'AGCTAGCTAGCTATTATCTACT'}
2.12    Exercise 12
In [14]: n = 1
         for key in seq_dict.keys():
             value = seq_dict[key]
             print("Entry {} has header: {} and sequence: {}".format(n, key, value))
             n += 1
                                          4
Entry 1 has header: header1.1 and sequence: ATGCTAGCTAGCTAGCTACG
Entry 2 has header: header1.2 and sequence: ACGTAGCTAGCTAGCAC
Entry 3 has header: header2.1 and sequence: AGCTAGCTAGCTATTATCTACT
2.13   Exercise 13
In [15]: def load_fasta(filepath):
              seqs = dict()
              with open(filepath, 'r') as in_fh:
                  key = None
                  for line in in_fh:
                      line = line.rstrip()
                      if line.startswith('>'):
                          key = line[1:]
                      else:
                          seq = line
                          seqs[key] = seq
              return seqs
          fasta_dict = load_fasta("test.fa")
          print(fasta_dict)
{'header1.1': 'ATGCTAGCTAGCTAGCTACG', 'header1.2': 'ACGTAGCTAGCTAGCAC', 'header2.1': 'AGCTAGCTA
2.14   Exercise 14
In [16]: def calcGC(seq_raw):
             seq = seq_raw.upper() # Handle both lower- and upper- case
             gc_count = seq.count("C") + seq.count("G")
             at_count = seq.count("A") + seq.count("T")
             return gc_count / (at_count + gc_count)
          sequenceList = list(fasta_dict.values())
          for sequence in sequenceList:
              gc = calcGC(sequence)
              print('{}: {}'.format(sequence, gc))
ATGCTAGCTAGCTAGCTACG: 0.5
ACGTAGCTAGCTAGCAC: 0.5294117647058824
AGCTAGCTAGCTATTATCTACT: 0.36363636363636365
                                         5
3     Case study
3.1   Exercise 15
In [ ]:
3.2   Exercise 16
In [18]: in_fp = 'data/fungus.gff'
         out_fp = 'data/fungus_renamed.gff'
          with open(in_fp, 'r') as in_fh, open(out_fp, 'w') as out_fh:
              lines = 0
              for line in in_fh:
                  line = line.rstrip()
                  lines += 1
                  if line.startswith('#'):
                      print(line, file=out_fh)
                  else:
                      fields = line.split('\t')
                      fields[0] = 'scaffold_{}'.format(fields[0][3:])
                      merged_line = '\t'.join(fields)
                      print(merged_line, file=out_fh)
              print('{} lines written to file {}'.format(lines, out_fp))
208425 lines written to file data/fungus_renamed.gff
3.3   Exercise 17
In [19]: def parse_fasta_to_dict(filepath):
              seqs = dict()
              with open(filepath, 'r') as in_fh:
                  key = None
                  for line in in_fh:
                      line = line.rstrip()
                      if line.startswith('>'):
                          key = line[1:]
                      else:
                          seq = line
                          seqs[key] = seq
              return seqs
          fasta_dict = parse_fasta_to_dict("data/fungusAssembly.single.fna")
          print(len(fasta_dict.keys()))
          print(len(fasta_dict['scaffold_10']))
                                         6
2681
1262682
3.4   Exercise 18
In [20]: def get_feature_coordinates(annot_gff, target_pattern):
              feature_coords_list = list()
              with open(annot_gff, 'r') as in_fh:
                  for line in in_fh:
                      if not line.startswith('##'):
                          line = line.rstrip()
                          fields = line.split('\t')
                          chrom_nbr = fields[0]
                          id_type = fields[2]
                          start_pos = int(fields[3])
                          end_pos = int(fields[4])
                          if id_type == target_pattern:
                              feature_coords_list.append((chrom_nbr, start_pos, end_pos))
              return feature_coords_list
          feature_coords = get_feature_coordinates('data/fungus_renamed.gff', 'CDS')
          print(feature_coords[0])
          print(feature_coords[1])
('scaffold_1', 1335, 1582)
('scaffold_1', 1640, 1947)
3.5   Exercise 19
In [21]: def extract_features(genome_dict, feature_coords):
              feature_dict = dict()
              for coord_tuple in feature_coords:
                  scaffold = coord_tuple[0]
                  start_pos = coord_tuple[1]
                  end_pos = coord_tuple[2]
                    seq = genome_dict[scaffold][start_pos-1:end_pos]
                    feature_dict['>{} {}-{}'.format(scaffold, start_pos, end_pos)] = seq
              return feature_dict
          genome_path = 'data/fungusAssembly.single.fna'
          annot_path = 'data/fungus_renamed.gff'
                                           7
          genome_dict = parse_fasta_to_dict(genome_path)
          feature_coords = get_feature_coordinates(annot_path, 'CDS')
          feature_fasta_dict = extract_features(genome_dict, feature_coords)
          print(feature_fasta_dict[">scaffold_1 1335-1582"])
          print(len(feature_fasta_dict[">scaffold_1 1335-1582"]))
ATGTGTAccccatcaatatcctcgcaaaatattcccagcgatgacaagagccgttcgagttcgcagggctccgaacgatgtggagagacctcgct
248
3.6   Exercise 20
In [22]: def write_fasta_dict(fasta_dict, out_fp):
             with open(out_fp, 'w') as out_fh:
                 for header in sorted(fasta_dict.keys()):
                     seq = fasta_dict[header]
                     print('{}\n{}'.format(header, seq), file=out_fh)
             print('Written {} entries to {}'.format(len(fasta_dict), out_fp))
          write_fasta_dict(feature_fasta_dict, 'data/out_dict.fa')
Written 88016 entries to data/out_dict.fa
3.7   Exercise 21
In [23]: fasta_dict = dict()
         with open('data/fungusAssembly.single.fna', 'r') as in_fh:
             key = None
             for line in in_fh:
                 line = line.rstrip()
                 if line.startswith('>'):
                     key = line[1:]
                 else:
                     seq = line
                     fasta_dict[key] = seq
          feature_coords = list()
          with open('data/fungus_renamed.gff', 'r') as in_fh:
              for line in in_fh:
                  if not line.startswith('##'):
                      line = line.rstrip()
                      fields = line.split('\t')
                      chrom_nbr = fields[0]
                      id_type = fields[2]
                      start_pos = int(fields[3])
                                         8
                         end_pos = int(fields[4])
                         if id_type == 'CDS':
                             feature_coords.append((chrom_nbr, start_pos, end_pos))
           feature_dict = dict()
           for coord_tuple in feature_coords:
               scaffold = coord_tuple[0]
               start_pos = coord_tuple[1]
               end_pos = coord_tuple[2]
               seq = genome_dict[scaffold][start_pos-1:end_pos]
               feature_dict['>{} {}-{}'.format(scaffold, start_pos, end_pos)] = seq
           with open('out_dict.fa', 'w') as out_fh:
               for header in sorted(fasta_dict.keys()):
                   seq = fasta_dict[header]
                   print('{}\n{}'.format(header, seq), file=out_fh)
           print(feature_fasta_dict[">scaffold_1 1335-1582"])
           print(len(feature_fasta_dict[">scaffold_1 1335-1582"]))
ATGTGTAccccatcaatatcctcgcaaaatattcccagcgatgacaagagccgttcgagttcgcagggctccgaacgatgtggagagacctcgct
248
3.8   A less readable way to write it
Everyone to their own, but for me this is not a recommended way, as it becomes very difficult to
follow the logic for other people, or yourself two weeks later, or even yourself while writing it (I
struggled while writing this!)
In [24]: d = dict()
         with open('data/fungusAssembly.single.fna', 'r') as file:
             key = None
             for x in file:
                 x = x.rstrip()
                 if x.startswith('>'):
                     key = x[1:]
                 else:
                     x1 = x
                     d[key] = x1
           mylist = list()
           with open('data/fungus_renamed.gff', 'r') as file:
               for l in file:
                   if not l.startswith('##'):
                       l = l.rstrip()
                                                 9
                    f = l.split('\t')
                    c = f[0]
                    t = f[2]
                    p1 = int(f[3])
                    p2 = int(f[4])
                    if t == 'CDS':
                        mylist.append((c, p1, p2))
        fd = dict()
        for ct in mylist:
            s = ct[0]
            p1 = ct[1]
            p2 = ct[2]
            s2 = d[s][p1-1:p2]
            fd['>{} {}-{}'.format(s, p1, p2)] = s2
        with open('out_dict.fa', 'w') as file2:
            for h in sorted(fd.keys()):
                s = fd[h]
                print('{}\n{}'.format(h, s), file=file2)
        print(fd[">scaffold_1 1335-1582"])
        print(len(fd[">scaffold_1 1335-1582"]))
ATGTGTAccccatcaatatcctcgcaaaatattcccagcgatgacaagagccgttcgagttcgcagggctccgaacgatgtggagagacctcgct
248
                                        10