forked from BIONF/taXaminer
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclasses.py
More file actions
933 lines (681 loc) · 33 KB
/
Copy pathclasses.py
File metadata and controls
933 lines (681 loc) · 33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
#!/usr/bin/env python
"""Classes for taXaminer
"""
__author__ = "Simonida Zehr, Freya Arthen"
__version__ = "0.6.0"
import numpy as np
from operator import itemgetter
class Config:
"""Object to hold all configuration options
"""
def __init__(self, cfg_dict):
"""
Constructs configuration object.
Parameters
----------
cfg_dict : dict
dictionary holding the preprocessed configuration parameters
"""
self.fasta_path = cfg_dict.get('fasta_path')
self.gff_path = cfg_dict.get('gff_path')
self.output_path = cfg_dict.get('output_path')
self.taxon_id = cfg_dict.get('taxon_id')
self.gff_ta_path = cfg_dict.get('gff_ta_path')
self.threads = cfg_dict.get('threads')
self.read_paths = cfg_dict.get('read_paths')
self.bam_paths = cfg_dict.get('bam_paths')
self.pbc_paths = cfg_dict.get('pbc_paths')
self.cov_set_exists = cfg_dict.get('cov_set_exists')
self.include_coverage = cfg_dict.get('include_coverage')
self.compute_coverage = cfg_dict.get('compute_coverage')
self.min_insert = cfg_dict.get('min_insert')
self.max_insert = cfg_dict.get('max_insert')
self.read_orientation = cfg_dict.get('read_orientation')
self.proteins_path = cfg_dict.get('proteins_path')
self.extract_proteins = cfg_dict.get('extract_proteins')
self.use_phase_info = cfg_dict.get('use_phase_info')
self.assignment_mode = cfg_dict.get('assignment_mode')
self.quick_mode_search_rank = cfg_dict.get('quick_mode_search_rank')
self.quick_mode_match_rank = cfg_dict.get('quick_mode_match_rank')
self.tax_assignment_path = cfg_dict.get('tax_assignment_path')
self.compute_tax_assignment = cfg_dict.get('compute_tax_assignment')
self.database_path = cfg_dict.get('database_path')
self.taxon_exclude = cfg_dict.get('taxon_exclude')
self.exclusion_rank = cfg_dict.get('exclusion_rank')
self.update_plots = cfg_dict.get('update_plots')
self.num_groups_plot = cfg_dict.get('num_groups_plot')
self.merging_labels = cfg_dict.get('merging_labels')
self.output_pdf = cfg_dict.get('output_pdf')
self.output_png = cfg_dict.get('output_png')
self.include_pseudogenes = cfg_dict.get('include_pseudogenes')
self.gff_preset = cfg_dict.get('gff_preset')
self.input_variables = cfg_dict.get('input_variables')
self.perform_parallel_analysis = cfg_dict.get('perform_parallel_analysis')
self.num_pcs = cfg_dict.get('num_pcs')
self.coverage_cutoff_mode = cfg_dict.get('coverage_cutoff_mode')
self.perform_kmeans = cfg_dict.get('perform_kmeans')
self.kmeans_k = cfg_dict.get('kmeans_k')
self.perform_hclust = cfg_dict.get('perform_hclust')
self.hclust_k = cfg_dict.get('hclust_k')
self.perform_mclust = cfg_dict.get('perform_mclust')
self.mclust_k = cfg_dict.get('mclust_k')
self.perform_dbscan = cfg_dict.get('perform_dbscan')
self.dbscan_groups = cfg_dict.get('dbscan_groups')
self.custom_eps = cfg_dict.get('custom_eps')
self.custom_minPts = cfg_dict.get('custom_minPts')
self.script_dir = cfg_dict.get('script_dir')
self.usr_cfg_path = cfg_dict.get('usr_cfg_path')
self.cfg_path = cfg_dict.get('cfg_path')
# GFF parsing rule
self.gff_source = cfg_dict.get('gff_source')
self.gff_gene_type = cfg_dict.get('gff_gene_type')
self.gff_transcript_type = cfg_dict.get('gff_transcript_type') \
if 'gff_transcript_type' in cfg_dict.keys() \
else cfg_dict.get('gff_fasta_header_type')
self.gff_cds_type = cfg_dict.get('gff_cds_type')
self.gff_fasta_header_type = cfg_dict.get('gff_fasta_header_type') \
if 'gff_fasta_header_type' in cfg_dict.keys() \
else cfg_dict.get('gff_transcript_type')
self.gff_fasta_header_attr = cfg_dict.get('gff_fasta_header_attr')
self.gff_connection = cfg_dict.get('gff_connection')
self.gff_parent_child_types = cfg_dict.get('gff_parent_child_types')
self.gff_parent_attr = cfg_dict.get('gff_parent_attr')
self.gff_child_attr = cfg_dict.get('gff_child_attr')
self.gff_gene_attr = cfg_dict.get('gff_gene_attr')
class Assembly:
"""Object holding information regarding the assembly
"""
def __init__(self, gff_path, fasta_path, pbc_paths, output_path):
self.gff_path = gff_path
self.fasta_path = fasta_path
self.pbc_paths = pbc_paths
self.num_pbc = max(1, len(pbc_paths))
self.output_path = output_path
self.output_dir = self.output_path + "gene_info"
self.total_z_score_vector = None
self.contigs = {}
self.all_contig_lengths = []
self.genes = {}
self.transcripts = {}
self.geneless_contigs = {} # dictionary of contigs without genes
self.contigs_without_cov = {} # dictionary to which all contigs without cov info will be moved to
self.genes_without_cov = {} # dictionary to which all genes without cov info will be moved to
self.partial_genes = {}
def get_gff_path(self):
return self.gff_path
def get_fasta_path(self):
return self.fasta_path
def get_pbc_paths(self):
return self.pbc_paths
def get_num_pbc(self):
return self.num_pbc
def get_output_path(self):
return self.output_path
def get_output_dir(self):
return self.output_dir
def get_total_z_score_vector(self):
return self.total_z_score_vector
def set_total_z_score_vector(self, total_z_score_vector):
self.total_z_score_vector = total_z_score_vector
def add_contig(self, contig_name, contig):
self.contigs[contig_name] = contig
def remove_contig(self, contig):
del self.contigs[contig]
def get_contigs(self):
return self.contigs
def get_contig(self, contig):
return self.contigs.get(contig)
def add_contig_length(self, contig_length):
self.all_contig_lengths.append(contig_length)
def get_all_contig_lengths(self):
return self.all_contig_lengths
def add_gene(self, gene_name, gene):
self.genes[gene_name] = gene
def remove_gene(self, gene):
del self.genes[gene]
def get_genes(self):
return self.genes
def get_gene(self, gene):
return self.genes.get(gene)
def add_geneless_contig(self, contig_name, contig):
self.geneless_contigs[contig_name] = contig
def remove_geneless_contig(self, contig):
del self.geneless_contigs[contig]
def get_geneless_contigs(self):
return self.geneless_contigs
def get_geneless_contig(self, contig):
return self.geneless_contigs[contig]
def add_contig_without_cov(self, contig_name, contig):
self.contigs_without_cov[contig_name] = contig
def remove_contig_without_cov(self, contig):
del self.contigs_without_cov[contig]
def get_contigs_without_cov(self):
return self.contigs_without_cov
def get_contig_without_cov(self, contig):
return self.contigs_without_cov[contig]
def add_gene_without_cov(self, gene_name, gene):
self.genes_without_cov[gene_name] = gene
def remove_gene_without_cov(self, gene):
del self.genes_without_cov[gene]
def get_genes_without_cov(self):
return self.genes_without_cov
def get_gene_without_cov(self, gene):
return self.genes_without_cov[gene]
def add_partial_gene(self, gene_name, gene):
if gene_name in self.partial_genes.keys():
self.partial_genes[gene_name].append(gene)
else:
self.partial_genes[gene_name] = [gene]
def remove_partial_gene(self, gene):
del self.partial_genes[gene]
def get_partial_genes(self):
return self.partial_genes
def get_partial_gene(self, gene):
return self.partial_genes[gene]
########################## CONTIG #################################
class Contig:
"""Object to store information for each contig
"""
def __init__(self, name, length, a):
self.name = name
self.length = length
self.centre_corrector = 0 # see method set_centre_corrector() for detailed explanation
self.set_centre_corrector()
self.per_base_coverages = {pbc_index: [] for pbc_index in a.get_pbc_paths().keys()} # array that holds the coverage per base in this contig
# note that the element at index i is the coverage at base i+1
# (indexing in python is 0-based but sequences are 1-based)
self.coverage = {pbc_index: np.nan for pbc_index in a.get_pbc_paths().keys()}
self.coverage_sd = {pbc_index: np.nan for pbc_index in a.get_pbc_paths().keys()}
# set that will hold all positions of Ns in this contig (1-based)
self.positions_of_Ns = None # used in --> add_base_coverage() function
self.genes = [] # fill with gene names in order to be able to access them from all_genes
self.gene_coverage_mean = {pbc_index: np.nan for pbc_index in a.get_pbc_paths().keys()}
self.gene_coverage_sd = {pbc_index: np.nan for pbc_index in a.get_pbc_paths().keys()}
self.gene_lengths_mean = None
self.gene_lengths_sd = None
self.gc_content = None
self.gene_gc_mean = None
self.gene_gc_sd = None
self.tetranuc_freqs = None
self.trinuc_freqs = None
self.dinuc_freqs = None
self.z_score_vector = None
def get_name(self):
return self.name
def get_length(self):
return self.length
def get_length_of_covered_bases(self, pbc_index):
"""Return how many bases are covered."""
pbc_without_nans = [cov for cov in self.per_base_coverages[pbc_index] if not np.isnan(cov)]
return len(pbc_without_nans)
def get_centre(self):
return ((self.length / 2) + 0.5) # returns the number of the base at the centre
# or, at even contig length, the pos. inbetween
def set_centre_corrector(self):
""" This method sets self.centre_corrector.
It is needed for the calculations of the absolute gene positions.
These are calculated with respect to the central base in the contig.
For a contig of odd length, there is an actual central base, e.g.
1-----7-----13
The gene positions are calculated using 1&7 and 7&13, for the left and right part, resp.
For a contig of even length, there is no actual central base, e.g.
1----6[6.5]7----12 (here the self-centre = 6.5)
The gene positions will be calculated using 1&6 and 7&12.
That's why we want self-centre_corrector (which by default is set to 0)
to be set to 0.5 (in order to add / substract it from self.centre)
when we have a contig of even length. """
if (self.length % 2 == 0): # if contig is of even length
self.centre_corrector = 0.5
def add_gene(self, gene):
self.genes.append(gene)
def get_genes(self):
return self.genes
def get_number_of_genes(self):
return len(self.genes)
def set_gene_positions(self, a):
"""Computes the absolute positions for all genes of this contig
Scaling to 1-----(0.5)-----0-----(0.5)-----1
Also overwrites self.genes array to hold the genes in their order on this contig.
"""
# get the middle base of this contig
contig_centre = self.get_centre()
# list that holds tuples of (gene, gene_start_pos)
# start_pos info will be used to sort the corresponding genes for this contig
tmp_gene_list = []
# for each gene name in the array that holds all genes of this contig
for gene in self.genes: # this fills tmp_list with the tuples (gene, gene_start_pos)
gene_start_pos = a.get_gene(gene).get_start_pos()
gene_centre = a.get_gene(gene).get_centre()
gene_end_pos = a.get_gene(gene).get_end_pos()
# if the gene is located in the left half of the contig
if (gene_centre <= contig_centre):
# (for even length contigs:
# correct the contig reference to the base left to the centre)
# (nothing happens for odd contig lengths)
corr_contig_centre = contig_centre - self.centre_corrector
# set gene position to a value between 1 (starts contig start)
# and > 0 (starts near contig centre)
gene_position = (gene_start_pos - corr_contig_centre) / (1 - corr_contig_centre)
# elif (gene_centre == contig_centre):
# gene_position = 0
# this has been omitted (and < contig_centre changed to <= contig_centre)
# because a central gene that makes up most of the contig
# would get the position 0 (even though it most likely ends near both termini)
# --> distorted depiction of actual distance to contig ends
# if the gene is located in the right half of the contig
elif (gene_centre > contig_centre):
# (for even length contigs:
# correct the contig reference to the base right to the centre)
# (nothing happens for odd contig lengths)
corr_contig_centre = contig_centre + self.centre_corrector
# set gene position to a value between > 0 (ends near the centre of the contig)
# and 1 (ends at contig end)
gene_position = (gene_end_pos - corr_contig_centre) / (self.length - corr_contig_centre)
# note: self.length is equal to the number of the base at the contig end
# if none of the above statements evaluates to true
# either gene_centre or contig_centre is erroneous
else:
print("""ERROR IN GENE POS CALCULATION. \n
gene = """, gene, ", gene_centre = ", gene_centre, ", contig_centre= ", contig_centre)
a.get_gene(gene).set_absolute_pos(gene_position)
# append gene and its start pos info to this list
tmp_gene_list.append((gene, gene_start_pos))
# sort list holding (gene, start_pos) by second item in the tuple, i.e. gene_start_pos
sorted_gene_list = sorted(tmp_gene_list, key=itemgetter(1))
# substitute contig's gene list with list of genes sorted by start_pos
self.genes = [i[0] for i in sorted_gene_list]
def set_positions_of_Ns(self, N_positions):
self.positions_of_Ns = N_positions
def add_base_coverage(self, pbc_index, base, coverage):
"""Adds coverage info for the base i to per_base_coverages[i-1].
Note: indices of per_base_coverage are 0-based,
while coverage file is 1-based."""
# if bases are skipped in the PBC file (because no coverage is given)
# add dummy values to the coverage array (as to avoid a shift in positions)
while len(self.per_base_coverages.get(pbc_index)) < (base -1):
# if the base without coverage is an N
if base in self.positions_of_Ns:
self.per_base_coverages[pbc_index].append(np.nan) # add a NaN
# --> the base (and its missing cov) will be ignored in calculating cov metrics
else:
# if the base is not an N
# add a 0 --> the base will be included in the cov metrics calculation
self.per_base_coverages[pbc_index].append(0)
# check whether coverage info will be inserted at the right place
# e.g. for coverage info for base 1, we want per_base_coverages to have a length of 0
if len(self.per_base_coverages.get(pbc_index)) == (base-1):
self.per_base_coverages[pbc_index].append(coverage)
else:
print("ERROR! Unexpected base position in per base coverage info") # THROW ERROR
print("Contig: ", self.get_name(),
", Expected Base: ", (len(self.per_base_coverages.get(pbc_index))+1), ", Got Base: ", base)
def compute_own_coverage_info(self):
# since, while filling the per_base_coverage array,
# missing base coverages are substituted with NaNs
# compute mean and SD while igonring NaNs
self.coverage = {pbc_index: (np.nanmean(coverages) if coverages else np.nan) for pbc_index, coverages in self.per_base_coverages.items()}
self.coverage_sd = {pbc_index: (np.nanstd(coverages) if coverages else np.nan) for pbc_index, coverages in self.per_base_coverages.items()}
def no_coverage_info(self):
if sum([len(cov) if cov else 0 for cov in self.per_base_coverages.values()]) == 0:
return True
else:
return False
def get_coverage(self, pbc_index):
return self.coverage.get(pbc_index)
def get_coverage_sd(self, pbc_index):
return self.coverage_sd.get(pbc_index)
def compute_gene_coverage_info(self, a):
"""Compute the coverage mean & SD and set them
for each Gene object associated with this contig"""
for gene_name in self.genes:
gene = a.get_gene(gene_name)
# because bp are 1-based but python lists are 0-based:
cov_start = gene.get_start_pos() - 1
# because list slicing ([cov_start:cov_end]) stops BEFORE the given index:
cov_end = gene.get_end_pos()
# (one would want to have gene_end+1, but since the array is 0-based
# it's already "shifted" +1 to the right
# since, while filling the per_base_coverage array,
# missing base coverages are substituted with NaNs
# get array WITHOUT NaNs
gene_pbc_without_nans = {}
for pbc_index in a.get_pbc_paths().keys():
gene_pbc_without_nans[pbc_index] = [cov for cov in self.per_base_coverages[pbc_index][cov_start:cov_end]
if not np.isnan(cov)]
gene_coverage = {pbc_index: (np.mean(coverages) if coverages else np.nan) for pbc_index, coverages in gene_pbc_without_nans.items()}
gene_coverage_sd = {pbc_index: (np.std(coverages, ddof=1) if coverages else np.nan) for pbc_index, coverages in gene_pbc_without_nans.items()}
# set the coverage info for the gene
gene.set_coverage_info(gene_coverage, gene_coverage_sd)
# also set info on how many bases are covered in this gene
gene.set_length_of_covered_bases({pbc_index: len(coverages) for pbc_index, coverages in gene_pbc_without_nans.items()})
def set_gene_coverage_mean_and_sd(self, gene_cov_mean, gene_cov_sd):
self.gene_coverage_mean = gene_cov_mean
self.gene_coverage_sd = gene_cov_sd
def get_gene_coverage_mean(self, pbc_index):
return self.gene_coverage_mean[pbc_index]
def get_gene_coverage_sd(self, pbc_index):
return self.gene_coverage_sd[pbc_index]
def get_coverage_array(self, start_pos, end_pos):
"""Return the coverage for all bases starting with
(including) start_pos and ending with end_pos"""
return self.coverages[(start_pos-1), end_pos]
# A) why "start_pos-1"? Because sequences are 1-based
# but array indexing is 0-based
# B) why not "end_pos-1"? Because slicing stops BEFORE the given index
# (one would want to take gene_end+1, but since the array is 0-based
# it's already "shifted" +1 to the right
def set_gene_gc_mean_and_sd(self, gene_gc_mean, gene_gc_sd):
self.gene_gc_mean = gene_gc_mean
self.gene_gc_sd = gene_gc_sd
def get_gene_gc_mean(self):
return self.gene_gc_mean
def get_gene_gc_sd(self):
return self.gene_gc_sd
def compute_gene_positional_info(self, a):
"""Compute and set all positional information for this gene
i.e. absolute position, no right/left neighbour info, single-gene contig info"""
# set absolute positions of genes
if self.genes: # check for emtpy list
self.set_gene_positions(a)
# self.genes get sorted by start_pos in this step
# flag first gene as without left neighbour
first_gene = a.get_gene(self.genes[0])
first_gene.set_no_left_neighbour(1)
# flag last gene as without right neighbour
last_gene = a.get_gene(self.genes[(len(self.genes) - 1)])
last_gene.set_no_right_neighbour(1)
# if there is only one gene in self.genes
if len(self.genes) == 1:
# flag as only gene
a.get_gene(self.genes[0]).set_single_gene_info(1)
def single_gene_info(self):
"""Returns pseudobool 1 if this contig has only one gene
(0 otherwise)."""
if len(self.genes) == 1:
return 1
else:
return 0
def geneless_info(self):
"""Returns pseudobool 1 if this contig has no genes
(0 otherwise)."""
if len(self.genes) == 0:
return 1
else:
return 0
def set_gene_lengths_mean_and_sd(self, gene_lengths_mean, gene_lengths_sd):
self.gene_lengths_mean = gene_lengths_mean
self.gene_lengths_sd = gene_lengths_sd
def get_gene_lengths_mean(self):
return self.gene_lengths_mean
def get_gene_lengths_sd(self):
return self.gene_lengths_sd
def covdev_from_overall(self, mean_ref, sd_ref, pbc_index):
"""Indicates how much the contig cov deviates from mean contig cov,
in units of contig cov SD (overall)."""
# if coverage is uniform (e.g. because of mock coverage)
# there is no SD in coverage; thus deviation in SD can not be computed
if sd_ref == 0:
return np.nan
# deviation in units of 1 SD (overall)
dev_in_sd = abs(mean_ref - self.coverage.get(pbc_index)) / sd_ref
# if own cov is smaller than overall contig cov mean
if self.coverage.get(pbc_index) < mean_ref:
# return negative deviation
return -dev_in_sd
else:
return dev_in_sd
def set_gc_content(self, gc_content):
self.gc_content = gc_content
def get_gc_content(self):
return self.gc_content
def gcdev_from_overall(self, mean_ref, sd_ref):
"""Indicates how much the contig GC deviates from mean contig GC,
in units of contig GC SD (overall)."""
# deviation in units of 1 SD (overall)
dev_in_sd = abs(mean_ref - self.gc_content) / sd_ref
# if own GC is smaller than overall contig GC mean
if self.gc_content < mean_ref:
# return negative deviation
return -dev_in_sd
else:
return dev_in_sd
def set_oligofreqs(self, tetranuc_freqs, trinuc_freqs, dinuc_freqs):
self.tetranuc_freqs = tetranuc_freqs
self.trinuc_freqs = trinuc_freqs
self.dinuc_freqs = dinuc_freqs
def get_oligofreqs(self):
return self.tetranuc_freqs, self.trinuc_freqs, self.dinuc_freqs
def set_z_score_vector(self, z_score_vector):
self.z_score_vector = z_score_vector
def get_z_score_vector(self):
return self.z_score_vector
############################ GENE ###################################
class Gene:
"""
Object to store information for each gene
Attributes
----------
name : str
start_pos : str
end_pos : str
contig : str
source : str
score : str
strand : str
attributes : str
# set length
length : int
percentage_of_contig_length : str
length_of_covered_bases : str
coverage : str
coverage_sd : str
absolute_pos : str
no_left_neighbour : str
no_right_neighbour : str
single_gene : str
gc_content : str
tetranuc_freqs : str
trinuc_freqs : str
dinuc_freqs : str
z_score_vector : str
"""
def __init__(self, name, start_pos, end_pos, contig, source, score, strand, attributes, a):
self.name = name
self.start_pos = start_pos
self.end_pos = end_pos
self.contig = contig
self.source = source
self.score = score
self.strand = strand
self.attributes = attributes
# set length
self.length = self.end_pos - self.start_pos + 1
self.percentage_of_contig_length = self.compute_percentage_of_contig_length(a)
self.length_of_covered_bases = {pbc_index: np.nan for pbc_index in a.get_pbc_paths().keys()}
self.coverage = {pbc_index: np.nan for pbc_index in a.get_pbc_paths().keys()} # list of mean coverage for each cov profile
self.coverage_sd = {pbc_index: np.nan for pbc_index in a.get_pbc_paths().keys()} # dict of coverage SD for each coverage profile
self.absolute_pos = None
self.no_left_neighbour = 0 # "false" (= L neighbour exists) by default
self.no_right_neighbour = 0 # "false" (= R neighbour exists) by default
self.single_gene = 0 # "false" by default
self.gc_content = None
self.tetranuc_freqs = None
self.trinuc_freqs = None
self.dinuc_freqs = None
self.z_score_vector = None
def get_name(self): # the name equals the ID in the attributes field
return self.name # (the ID is required to be unique within the scope of the GFF)
def get_start_pos(self):
return self.start_pos
def get_end_pos(self):
return self.end_pos
def get_source(self):
return self.source
def get_contig(self):
return self.contig
def get_score(self):
return self.score
def get_strand(self):
return self.strand
def get_length(self):
return self.length
def get_attributes(self):
return self.attributes # list of all attributes
def get_centre(self):
return (self.start_pos + (self.length / 2) - 0.5)
# works for even lengths:
# e.g. if gene covers positions 3, 4, 5, 6 in the contig (centre = 4.5)
# --> length=4, half length =2 --> centre = 3+ 2 -0.5 = 4.5
# and for odd lengths:
# e.g. if gene covers positions 3, 4, 5, 6, 7 in the contig (centre = 5)
# --> length = 5, half length = 2.5 --> centre = 3 + 2.5 - 0.5 = 5
def set_coverage_info(self, coverage, coverage_sd):
self.coverage = coverage
self.coverage_sd = coverage_sd
def get_coverage(self, pbc_index):
return self.coverage.get(pbc_index)
def get_coverage_sd(self, pbc_index):
return self.coverage_sd.get(pbc_index)
def set_absolute_pos(self, position):
self.absolute_pos = position
def get_absolute_pos(self):
return self.absolute_pos
def set_no_left_neighbour(self, pseudobool):
self.no_left_neighbour = pseudobool
def get_no_left_neighbour(self):
return self.no_left_neighbour
def set_no_right_neighbour(self, pseudobool):
self.no_right_neighbour = pseudobool
def get_no_right_neighbour(self):
return self.no_right_neighbour
def get_terminal_info(self):
if self.no_left_neighbour == 1 or self.no_right_neighbour == 1:
return 1
else:
return 0
def set_single_gene_info(self, pseudobool):
self.single_gene = pseudobool
def get_single_gene_info(self):
return self.single_gene
def set_length_of_covered_bases(self, length):
self.length_of_covered_bases = length
def get_length_of_covered_bases(self, pbc_index):
return self.length_of_covered_bases.get(pbc_index)
def compute_percentage_of_contig_length(self, a):
contig_length = a.get_contig(self.contig).get_length()
return ((self.length / contig_length) * 100)
def get_percentage_of_contig_length(self):
return self.percentage_of_contig_length
def set_gc_content(self, gc_content):
self.gc_content = gc_content
def get_gc_content(self):
return self.gc_content
def lendev_from_contig(self, a):
"""Indicates how much the gene length deviates from the mean gene length
on the contig, in units of gene length SD (contig)."""
# if this is the only gene on the contig
if self.single_gene == 1:
# no comparisons possible
return np.nan
gene_length_sd_contig = a.get_contig(self.contig).get_gene_lengths_sd()
# if all genes in the contig have the same length
if gene_length_sd_contig == 0:
# no deviation from contig
return np.nan
gene_length_mean_contig = a.get_contig(self.contig).get_gene_lengths_mean()
# deviation in units of 1 SD (contig)
dev_in_sd = abs(gene_length_mean_contig - self.length) / gene_length_sd_contig
# if own length is smaller than gene length mean on contig
if self.length < gene_length_mean_contig:
# return negative deviation
return -dev_in_sd
else:
return dev_in_sd
def lendev_from_overall(self, mean_ref, sd_ref):
"""Indicates how much the gene length deviates from overall mean gene length,
in units of gene length SD (overall)."""
# deviation in units of 1 SD (overall)
dev_in_sd = abs(mean_ref - self.length) / sd_ref
# if own length is smaller than gene length mean
if self.length < mean_ref:
# return negative deviation
return -dev_in_sd
else:
return dev_in_sd
def covdev_from_contig(self, a, pbc_index):
"""Indicates how much the gene cov deviates from the mean gene coverage
on the contig, in units of gene cov SD (contig)."""
# if this is the only gene on the contig
if self.single_gene == 1:
# no comparisons possible
return np.nan
gene_cov_sd_contig = a.get_contig(self.contig).get_gene_coverage_sd(pbc_index)
if gene_cov_sd_contig == 0:
# no deviation from contig
return np.nan
gene_cov_mean_contig = a.get_contig(self.contig).get_gene_coverage_mean(pbc_index)
# deviation in units of 1 SD (contig)
dev_in_sd = abs(gene_cov_mean_contig - self.coverage[pbc_index]) / gene_cov_sd_contig
# if own cov is smaller than gene cov mean on contig
if self.coverage[pbc_index] < gene_cov_mean_contig:
# return negative deviation
return -dev_in_sd
else:
return dev_in_sd
def covdev_from_overall(self, mean_ref, sd_ref, pbc_index):
"""Indicates how much the gene cov deviates from overall mean gene cov,
in units of gene cov SD (overall)."""
# if coverage is uniform (e.g. because of mock coverage)
# there is no SD in coverage; thus deviation in SD can not be computed
if sd_ref == 0:
return np.nan
# deviation in units of 1 SD (overall)
dev_in_sd = abs(mean_ref - self.coverage[pbc_index]) / sd_ref
# if own cov is smaller than gene cov mean
if self.coverage[pbc_index] < mean_ref:
# return negative deviation
return -dev_in_sd
else:
return dev_in_sd
def gcdev_from_contig(self, a):
"""Indicates how much the gene GC deviates from the mean gene GC content
on the contig, in units of gene GC SD (contig)."""
# if this is the only gene on the contig
if self.single_gene == 1:
# no comparisons possible
return np.nan
gene_gc_sd_contig = a.get_contig(self.contig).get_gene_gc_sd()
# if all genes in the contig have the same gc content
if gene_gc_sd_contig == 0:
# no deviation from contig
return np.nan
gene_gc_mean_contig = a.get_contig(self.contig).get_gene_gc_mean()
# deviation in units of 1 SD (contig)
dev_in_sd = abs(gene_gc_mean_contig - self.gc_content) / gene_gc_sd_contig
# if own GC is smaller than gene GC mean on contig
if self.gc_content < gene_gc_mean_contig:
# return negative deviation
return -dev_in_sd
else:
return dev_in_sd
def gcdev_from_overall(self, mean_ref, sd_ref):
"""Indicates how much the gene GC content deviates from overall mean gene GC,
in units of gene GC SD (overall)."""
# deviation in units of 1 SD (overall)
dev_in_sd = abs(mean_ref - self.gc_content) / sd_ref
# if own GC is smaller than gene GC mean
if self.gc_content < mean_ref:
# return negative deviation
return -dev_in_sd
else:
return dev_in_sd
def set_oligofreqs(self, tetranuc_freqs, trinuc_freqs, dinuc_freqs):
self.tetranuc_freqs = tetranuc_freqs
self.trinuc_freqs = trinuc_freqs
self.dinuc_freqs = dinuc_freqs
def get_oligofreqs(self):
return self.tetranuc_freqs, self.trinuc_freqs, self.dinuc_freqs
def set_z_score_vector(self, z_score_vector):
self.z_score_vector = z_score_vector
def get_z_score_vector(self):
return self.z_score_vector