bioinformatics.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. import numpy as np
  2. from ctypes import *
  3. import matplotlib.pyplot as plt
  4. import pandas as pd
  5. from multiprocessing import Pool, Process, freeze_support
  6. from difflib import SequenceMatcher
  7. import itertools
  8. from functools import partial
  9. #test seq about 500 nucleotides from e-coli genome
  10. seq = """atcaatgatcaacgtaagcttctaagcatgatcaaggtgctcacacagtttatccacaac
  11. ctgagtggatgacatcaagataggtcgttgtatctccttcctctcgtactctcatgacca
  12. cggaaagatgatcaagagaggatgatttcttggccatatcgcaatgaatacttgtgactt
  13. gtgcttccaattgacatcttcagcgccatattgcgctggccaaggtgacggagcgggatt
  14. acgaaagcatgatcatggctgttgttctgtttatcttgttttgactgagacttgttagga
  15. tagacggtttttcatcactgactagccaaagccttactctgcctgacatcgaccgtaaat
  16. tgataatgaatttacatgcttccgcgacgatttacctcttgatcatcgatccgattgaag
  17. atcttcaattgttaattctcttgcctcgactcatagccatgatgagctcttgatcatgtt
  18. tccttaaccctctattttttacggaagaatgatcaagctgctgctcttgatcatcgtttc"""
  19. #human_dna = pd.read_table("input/dna/human.txt")
  20. base_pairs = {'G': 'C', 'C': 'G', 'T': 'A', 'A': 'T'}
  21. #refactor this to return starting locations in genome as well
  22. def find_kmers(dna, filter_size: int):
  23. dna_len = len(dna)
  24. if(filter_size > dna_len): raise Exception("Filter size should not be larger than DNA Length")
  25. print(f'sequencing dna of length {dna_len}')
  26. seq_list = []
  27. kmer_list = dict()
  28. for i in range(0, (dna_len-filter_size)):
  29. segment = dna[i:i+filter_size]
  30. if(seq_list.count(segment) > 0):
  31. if(kmer_list.get(segment)):
  32. kmer_list[segment] = kmer_list[segment] + 1
  33. #once it appears a second time we add it to the kmer list with a count of 2
  34. else:
  35. kmer_list[segment] = 2
  36. #since we havent sequenced this yet we will add it to the list
  37. else:
  38. seq_list.append(segment)
  39. #kind of shit until i feel like making a temporary dictionary to put sequences that only appear once
  40. for key in kmer_list:
  41. if(kmer_list[key] < 2):
  42. del kmer_list[key]
  43. return kmer_list
  44. #find kmers with a variability of max_diff SNPs
  45. def find_kmers_with_differences(dna, filter_size: int, max_diff: int):
  46. if(max_diff > filter_size): raise Exception("Max difference should not be larger than filter size")
  47. dna_len = len(dna)
  48. if(filter_size > dna_len): raise Exception("Filter size should not be larger than DNA Length")
  49. print(f'sequencing dna of length {dna_len}')
  50. leniancy_ratio = float(max_diff/filter_size)
  51. seq_list = []
  52. kmer_list = dict()
  53. for i in range(0, (dna_len-filter_size)):
  54. segment = dna[i:i+filter_size]
  55. if(seq_list.count(segment) > 0):
  56. if(kmer_list.get(segment)):
  57. kmer_list[segment] = kmer_list[segment] + 1
  58. #once it appears a second time we add it to the kmer list with a count of 2
  59. else:
  60. kmer_list[segment] = 2
  61. #since we havent sequenced this yet we will add it to the list
  62. else:
  63. if(len(seq_list) > 0):
  64. with Pool(4) as p:
  65. pOutput = p.map(partial(check_for_discreprency, comparison=segment), seq_list)
  66. print(pOutput)
  67. seq_list.append(segment)
  68. #kind of shit until i feel like making a temporary dictionary to put sequences that only appear once
  69. for key in kmer_list:
  70. if(kmer_list[key] < 2):
  71. del kmer_list[key]
  72. return kmer_list
  73. def check_for_discreprency(segment, comparison):
  74. print(f'comparing {segment} and {comparison}')
  75. match = SequenceMatcher(None, segment, comparison).ratio()
  76. print(match)
  77. return match
  78. def find_specific_kmer_with_differences(dna, kmer, max_diff: int):
  79. dna_len = len(dna)
  80. if(filter_size > dna_len): raise Exception("Filter size should not be larger than DNA Length")
  81. print(f'sequencing dna of length {dna_len}')
  82. seq_list = []
  83. kmer_list = dict()
  84. for i in range(0, (dna_len-filter_size)):
  85. segment = dna[i:i+filter_size]
  86. if(seq_list.count(segment) > 0):
  87. if(kmer_list.get(segment)):
  88. kmer_list[segment] = kmer_list[segment] + 1
  89. #once it appears a second time we add it to the kmer list with a count of 2
  90. else:
  91. kmer_list[segment] = 2
  92. #since we havent sequenced this yet we will add it to the list
  93. else:
  94. seq_list.append(segment)
  95. #kind of shit until i feel like making a temporary dictionary to put sequences that only appear once
  96. for key in kmer_list:
  97. if(kmer_list[key] < 2):
  98. del kmer_list[key]
  99. return kmer_list
  100. def compliment(kmer):
  101. compliment = ""
  102. for i in range(0, len(kmer)):
  103. compliment += base_pairs[kmer]
  104. return compliment
  105. def reverse_compliment(kmer):
  106. #since we bind with the reverse
  107. reverse = kmer[::-1]
  108. compliment = ""
  109. for i in range(0, len(reverse)):
  110. compliment += base_pairs[reverse[i]]
  111. return compliment
  112. def skew(genome):
  113. x = np.arange(0, len(genome), 1)
  114. skewData = np.empty(shape=len(genome) + 1, dtype=int)
  115. skewData[0] = 0
  116. print(len(skewData))
  117. for i in range(1, len(genome) + 1):
  118. if(genome[i-1] == 'G'):
  119. skewData[i] = (skewData[i-1] + 1)
  120. elif(genome[i-1] == 'C'):
  121. skewData[i] = (skewData[i-1] - 1)
  122. else:
  123. skewData[i] = skewData[i-1]
  124. return skewData
  125. kmer_dict = dict()
  126. def main():
  127. #most DnaA boxes appear in 3-9mers
  128. kmer_dict = find_kmers_with_differences(seq.upper().replace('\n', ''), 9, 4)
  129. print(kmer_dict)
  130. skewData = skew(seq.upper())
  131. plt.plot(skewData)
  132. plt.xlabel('nucleotide position')
  133. plt.ylabel('skew')
  134. plt.show()
  135. if __name__ == '__main__':
  136. freeze_support()
  137. main()