# MiMiC2/MiMiC2-BUTLER.py at main · ClavelLab/MiMiC2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/usr/bin/env python3.11
import glob
import pandas as pd
import operator
import collections
import numpy as np
import sys
import scipy.stats as stats
import itertools
import subprocess
import random
from tqdm import tqdm
import pandas
from statistics import mean
import os
import argparse


# Directory that contains this script, with symlinks resolved.
location_of_file = os.path.dirname(os.path.realpath(__file__))

# Working directory at launch; the profile file is written relative to it.
location_of_call = os.getcwd()

version = '2024-08-08'


# User input
parser = argparse.ArgumentParser(description='MiMiC2-BUTLER v' + version)
# Options for Sample data
parser.add_argument('-s','--samples', metavar='{INPUT}', required=True, help='Provide a folder which contains all of your Pfam annotated genomes/metagenomes.')

# Option for Function database
parser.add_argument('-p','--pfam', metavar='{INPUT}',required=True, help='Pfam file e.g. Pfam-A.clans.csv, provided for Pfam v32 in `datasets/core/`')

# State tool used
parser.add_argument('-t','--tool', metavar='{TEXT}', help='State the tool used to annotate the genomes against the Pfam database: `hmmsearch` or `hmmscan`')

# Options for output
parser.add_argument('-o','--output', metavar='{OUTPUT}', required=True, help='Prefix for all the Pfam-profile file e.g. HuSynCom.')

# File ending used both to select input files and to derive sample names.
# Optional: a default combined with required=True could never take effect,
# so the option now falls back to '.hmmer' when it is omitted.
parser.add_argument('-e','--extension', metavar='{TEXT}', default='.hmmer', help='Provide the extension for your Pfam annotation files.')

# Merge mode: combine existing profile files instead of parsing annotations.
parser.add_argument('-m','--merge', default=False, action=argparse.BooleanOptionalAction, help='Option to merge premade vector files within a folder, define -e as "profile.txt"')


# parse_known_args() tolerates unrecognised arguments; they are collected in
# `unknown` and otherwise ignored.
args, unknown = parser.parse_known_args()


print (': Reading in the users options.')

print (args.merge)

# Unpack the options into the module-level names used by the rest of the script.
pfam_file = args.pfam

genome_folder = args.samples

file_ending = args.extension

output_file = args.output + '-profile.txt'


# Load the Pfam accessions: the first tab-separated field of every line of
# the Pfam file. A list (not a set) is used so the file order is preserved.
pfams = []

# `with` guarantees the handle is closed once the file has been read.
with open(pfam_file, 'r') as pfam_handle:
    for line in tqdm(pfam_handle):
        # Strip the line terminator first so a line without any tab does not
        # leave a trailing newline inside the accession.
        accession = line.rstrip('\r\n').split('\t')[0]
        if accession:  # ignore blank lines
            pfams.append(accession)

print (': The number of pfams studied are: ' + str(len(pfams)))


print (': Users options accepted.')

# Number of files in the sample folder whose name ends with the given extension.
counting = len(glob.glob(genome_folder + '/*' + file_ending))

print (': Files meeting criteria in folder:' + str(counting))


# Merge mode: join previously generated profile tables on their 'PfamID'
# column, write the combined table and stop; the annotation-parsing steps
# further down are skipped.
if args.merge:
    print (':: Merging your profiles.')
    profile_paths = glob.glob(genome_folder + '/*' + file_ending)

    # Without this guard there would be no DataFrame to write and the script
    # would fail with an unhelpful AttributeError.
    if not profile_paths:
        print ('ERROR: No files ending in "' + file_ending + '" were found in ' + genome_folder)
        print ('ACTION: Forced exiting now.')
        sys.exit(1)

    combined = None
    for cfile in tqdm(profile_paths):
        df = pd.read_csv(cfile, sep='\t')
        if combined is None:
            combined = df
        else:
            # pd.merge performs an inner join by default, so only PfamIDs
            # present in every file are kept.
            combined = pd.merge(combined, df, on='PfamID')

    combined.to_csv(location_of_call + '/' + output_file, sep='\t', index=False, header=True)
    sys.exit()


# 0-based index of the whitespace-separated field that holds the Pfam
# accession for each supported tool. In HMMER --tblout output field 1 is the
# target accession and field 3 the query accession.
_ACCESSION_FIELD = {'hmmscan': 1, 'hmmsearch': 3}


def _collect_sample_pfams(annotation_path, accession_field, known_pfams):
    """Return the unique Pfam accessions listed in one annotation file.

    Args:
        annotation_path: Path of the annotation file to parse.
        accession_field: 0-based index of the whitespace-separated field
            holding the Pfam accession (e.g. ``PF00001.23``).
        known_pfams: Set of valid accessions, without version suffix.

    Returns:
        A list of accessions with the version suffix removed, de-duplicated
        and in first-seen order.

    Exits with status 1 (after printing an explanation) when a line has too
    few fields or names an accession that is not in ``known_pfams``.
    """
    found = {}  # dict keys give de-duplication while preserving order
    with open(annotation_path) as handle:
        for line in handle:
            if line.startswith('#'):
                continue  # comment / header line
            fields = line.split()
            if not fields:
                continue  # blank line
            if len(fields) <= accession_field:
                print ('ERROR: Malformed annotation line in ' + annotation_path + '; ' + line.rstrip('\n'))
                print ('ACTION: Forced exiting now.')
                sys.exit(1)

            pfam = fields[accession_field].split('.')[0]
            if pfam not in known_pfams:
                print ('ERROR: PFAM not found in database; ' + str(pfam))
                print ('HINT: Are you using the same Pfam versions for each step in the analysis?')
                print ('ACTION: Forced exiting now.')
                sys.exit(1)
            found[pfam] = None
    return list(found)


# Refuse to continue with an unknown or missing tool instead of reporting
# success without having written anything.
if args.tool not in _ACCESSION_FIELD:
    print ('ERROR: Unknown or missing annotation tool; ' + str(args.tool))
    print ('HINT: Use -t/--tool with `hmmsearch` or `hmmscan`.')
    print ('ACTION: Forced exiting now.')
    sys.exit(1)

print (':: Handling your files and cataloging their Pfam presence/absence.')

known_pfams = set(pfams)  # O(1) membership tests while parsing
accession_field = _ACCESSION_FIELD[args.tool]

# Sample name (file name without the extension) -> Pfams found in that sample.
samples_data = {}
for cfile in tqdm(glob.glob(genome_folder + '/*' + file_ending)):
    # Strip the extension only from the end of the file name.
    sample = os.path.basename(cfile).removesuffix(file_ending)
    samples_data[sample] = _collect_sample_pfams(cfile, accession_field, known_pfams)

sample_names = list(samples_data)
sample_sets = [set(samples_data[name]) for name in sample_names]

# Presence/absence matrix: one row per Pfam (in Pfam-file order, duplicates
# collapsed), one column per sample, tab separated.
with open(location_of_call + '/' + output_file, 'w') as outputting:
    outputting.write('\t'.join(['PfamID'] + sample_names) + '\n')
    for pfam in dict.fromkeys(pfams):
        flags = ['1' if pfam in present else '0' for present in sample_sets]
        outputting.write('\t'.join([pfam] + flags) + '\n')


print ('::: Your Pfam profile has been prepared for you.')