Problem
Background:
I have written this code to transform a .csv file, exported from a software package called Geneious and containing SNPs, by concatenating those SNPs into a DNA sequence.
So basically take fields from .csv file to make strings.
The code itself is just a bunch of functions that perform small tasks; some functions call others, and in the end the result is written to a file. I used argparse because this is going to be a command-line tool, and it is useful to have required arguments and default values for the others.
I am inexperienced in coding and have no one to review my code. I feel that needing to pass every argument to every function is really awkward.
My questions:
Is this the best structure? Is creating a “chain” of functions like this best practice?
Code
import argparse
import collections
import csv
def cleaning(file_as_list, snp, names):
    """Load the CSV export and keep only the rows flagged as SNPs.

    Args:
        file_as_list: Path of the CSV file to read (despite the name, this
            is a path that gets opened, not a list).
        snp: Zero-based index of the column whose value must be ``'1'`` for
            a row to be kept.
        names: Zero-based index of the column holding the genome names.

    Returns:
        The kept rows as lists of strings, except that the ``names`` column
        of every row is replaced by a list of genome names.

    Rows too short to contain the ``snp`` column (for example blank lines,
    which ``csv.reader`` yields as empty lists) are skipped instead of
    raising ``IndexError``.
    """
    # newline='' hands newline handling over to the csv module, as its
    # documentation requires for files passed to csv.reader.
    with open(file_as_list, 'r', newline='') as input_file:
        snp_rows = [row for row in csv.reader(input_file)
                    if len(row) > snp and row[snp] == '1']
    for row in snp_rows:
        # Names can be separated by ':', ',' or ', '; normalise to ','.
        joined_names = row[names].replace(':', ',').replace(', ', ',')
        # Entries that still contain a space are discarded (presumably they
        # are labels rather than genome names - confirm against the export).
        row[names] = [name for name in joined_names.split(',')
                      if ' ' not in name]
    return snp_rows
def reference_dic(file_as_list, snp, names, col_ref, pos):
    """Map every SNP position to the reference nucleotide at that position.

    The position column may contain thousands separators (``'1,234'``);
    they are stripped before the value is converted to ``int``. When the
    same position occurs on several rows, the last row wins.

    Args:
        file_as_list: Path of the CSV file, forwarded to ``cleaning``.
        snp: Zero-based index of the SNP flag column.
        names: Zero-based index of the genome names column.
        col_ref: Zero-based index of the reference nucleotide column.
        pos: Zero-based index of the position column.

    Returns:
        A dict of ``{position (int): reference nucleotide (str)}``.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    return {int(row[pos].replace(',', '')): row[col_ref] for row in snp_rows}
def pos_list(file_as_list, snp, names, col_ref, pos):
    """Return all positions present in the reference, in ascending order.

    The arguments are forwarded unchanged to ``reference_dic``; the result
    is the sorted list of that dict's keys.
    """
    # Iterating a dict yields its keys, so sorted() can consume it directly.
    return sorted(reference_dic(file_as_list, snp, names, col_ref, pos))
def genomes_list(file_as_list, snp, names, col_ref, pos):
    """List the distinct genome names found in the input file.

    Names are returned once each, in order of first appearance (given an
    insertion-ordered ``dict``). ``col_ref`` and ``pos`` are accepted for
    signature parity with the other helpers but are not used here.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_names = dict.fromkeys(
        genome for row in snp_rows for genome in row[names])
    return list(unique_names)
def identify_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Collect, for every genome, the SNPs that the input file lists for it.

    Args:
        file_as_list: Path of the CSV file.
        snp: Zero-based index of the SNP flag column.
        names: Zero-based index of the genome names column.
        col_ref: Zero-based index of the reference nucleotide column
            (only forwarded to ``genomes_list``).
        pos: Zero-based index of the position column.
        col_genome: Zero-based index of the genome nucleotide column.

    Returns:
        A pair ``(as_lists, as_dicts)`` with one entry per genome each:
        ``as_lists`` holds ``(name, [[position, nucleotide], ...])`` and
        keeps repeated positions; ``as_dicts`` holds
        ``(name, {position: nucleotide})`` where a repeated position keeps
        the value from the last matching row.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    as_lists = []
    as_dicts = []
    for genome in genomes_list(file_as_list, snp, names, col_ref, pos):
        pairs = []
        by_position = {}
        for row in snp_rows:
            if genome not in row[names]:
                continue
            # Positions may carry thousands separators, e.g. '1,234'.
            position = int(row[pos].replace(',', ''))
            nucleotide = row[col_genome]
            pairs.append([position, nucleotide])
            by_position[position] = nucleotide
        as_lists.append((genome, pairs))
        as_dicts.append((genome, by_position))
    return as_lists, as_dicts
def remove_dupli_pos(file_as_list, snp, names, col_ref, pos, col_genome):
    """Return the sorted reference positions that are unambiguous.

    A position is dropped for every genome as soon as it is listed more
    than once for any single genome.

    Returns:
        The positions from ``pos_list`` (ascending) minus the duplicated
        ones.
    """
    per_genome = identify_genomes(file_as_list, snp, names, col_ref,
                                  pos, col_genome)[0]
    duplicated = set()
    for _genome, snp_pairs in per_genome:
        # Each pair is [position, nucleotide]; only the position matters.
        occurrences = collections.Counter(pair[0] for pair in snp_pairs)
        duplicated.update(position
                          for position, count in occurrences.items()
                          if count > 1)
    every_position = pos_list(file_as_list, snp, names, col_ref, pos)
    return [position for position in every_position
            if position not in duplicated]
def get_ref(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build the reference sequence from the reference nucleotide of each SNP.

    Only the positions kept by ``remove_dupli_pos`` are used, in ascending
    order.

    Returns:
        The concatenated reference nucleotides as a single string.
    """
    ref_by_position = reference_dic(file_as_list, snp, names, col_ref, pos)
    kept_positions = remove_dupli_pos(file_as_list, snp, names, col_ref,
                                      pos, col_genome)
    return "".join(str(ref_by_position[position])
                   for position in kept_positions)
def get_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build the FASTA lines (header, then sequence) for every genome.

    For each kept position, a genome contributes its own nucleotide when it
    has a SNP there, and ``'N'`` when the position is only known from the
    reference.

    Returns:
        A flat list alternating ``'>name'`` header lines and sequence
        strings, two entries per genome.
    """
    ref_by_position = reference_dic(file_as_list, snp, names, col_ref, pos)
    kept_positions = remove_dupli_pos(file_as_list, snp, names, col_ref, pos,
                                      col_genome)
    per_genome = identify_genomes(file_as_list, snp, names, col_ref, pos,
                                  col_genome)[1]
    fasta_lines = []
    for genome_name, snp_by_position in per_genome:
        bases = []
        for position in kept_positions:
            if position in snp_by_position:
                bases.append(str(snp_by_position[position]))
            elif position in ref_by_position:
                bases.append('N')
            else:
                # Position known to neither the genome nor the reference:
                # report it and stop extending this genome's sequence.
                print("ERROR!!!!")
                break
        fasta_lines.append(">{0}".format(genome_name))
        fasta_lines.append("".join(bases))
    return fasta_lines
def main(file_as_list, snp, names, col_ref, pos, col_genome,
         out_path="files.fasta"):
    """Write the reference and all genomes to a FASTA file.

    Args:
        file_as_list: Path of the input CSV file.
        snp: Zero-based index of the SNP flag column.
        names: Zero-based index of the genome names column.
        col_ref: Zero-based index of the reference nucleotide column.
        pos: Zero-based index of the position column.
        col_genome: Zero-based index of the genome nucleotide column.
        out_path: Path of the FASTA file to create or overwrite. Defaults
            to ``"files.fasta"``, the name that used to be hard-coded.
    """
    ref_genome = get_ref(file_as_list, snp, names, col_ref, pos, col_genome)
    genomes = get_genomes(file_as_list, snp, names, col_ref, pos, col_genome)
    with open(out_path, "w") as out_file:
        # Every header and every sequence goes on its own line; the line
        # terminators had degraded to a literal "n" in the string literals.
        out_file.write(">reference_sequence\n")
        out_file.write("{0}\n".format(ref_genome))
        out_file.writelines("{0}\n".format(line) for line in genomes)
if __name__ == '__main__':
    # Command-line entry point: parse the column numbers, echo them back,
    # then run the conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument("input",
                        help="name of the input file")
    # type=int makes argparse validate the value and report a usage error
    # instead of failing later with a ValueError traceback.
    parser.add_argument("-r", "--col_ref_genome_nuc", type=int, default=2,
                        help="""number of the column with the reference genome
                        nucleotides""")
    parser.add_argument("-g", "--col_genomes_nuc", type=int, default=8,
                        help="""number of the column with the genomes
                        nucleotides""")
    parser.add_argument("-p", "--position", type=int, default=3,
                        help="""number of the column with the position in the
                        genome""")
    parser.add_argument("-n", "--genome_names", type=int, default=10,
                        help="number of the column with the genomes names")
    parser.add_argument("-s", "--is_snp", type=int, default=7,
                        help="number of the column with lenght")
    args = parser.parse_args()
    # One setting per line; the "\n" escapes had degraded to a literal "n".
    print("Columns:\n[Reference genome:{0}]\n[Genomes:{1}]\n"
          "[Position of the SNP:{2}]\n[Genomes name:{3}]\n"
          "[Is SNP:{4}]".format(args.col_ref_genome_nuc, args.col_genomes_nuc,
                                args.position, args.genome_names, args.is_snp))
    # Columns are numbered from 1 on the command line but indexed from 0
    # in the parsed rows.
    col_ref = args.col_ref_genome_nuc - 1
    col_genome = args.col_genomes_nuc - 1
    pos = args.position - 1
    names = args.genome_names - 1
    snp = args.is_snp - 1
    file_as_list = str(args.input)
    print("\nProcessing...")
    main(file_as_list, snp, names, col_ref, pos, col_genome)
    print("\nJob Done. Output written as <files.fasta>")
Solution
This sounds like a good use case for a class, where the arguments you currently pass through the chain of functions become instance attributes. For instance, if we had a single Converter
class, we could initialize it this way:
class Converter:
    """Hold the input file name and column settings shared by all steps."""

    def __init__(self, filename, snp, names, col_ref, pos, col_genome):
        """Store every argument unchanged as an instance attribute."""
        # Where to read from, and how to recognise SNP rows and genome names.
        self.filename, self.snp, self.names = filename, snp, names
        # Columns used to build the reference and the genome sequences.
        self.col_ref, self.pos, self.col_genome = col_ref, pos, col_genome
Then, your functions will become instance methods where you’ll access the instance attributes via self.<attribute>
instead of taking an argument.
Think of a class as a way to group related things logically, providing a shared access to the common variables and methods.
There are also some other things to improve:
- instead of converting your arguments to `int`, you can define them with `type=int`
- you can make use of dictionary and list comprehensions in multiple places
- you can make use of `str.join()` – for example, when defining `reference_snps_list`: `reference_snps_list = "".join(str(ref_dic[i]) for i in pos_no_dup)`
- you can use the special `argparse.FileType` for the input file argument
FYI, since this is a controversial topic in general: