Python command-line program to convert genomic data file

Posted on

Problem

Background:

I have written this code to transforme a .csv file exported from a software called Geneious containing SNPs and concatenate them into a DNA sequence.

So basically take fields from .csv file to make strings.

The code itself is just a bunch of functions that perform small tasks, some functions call others and in the end the result is printed to a file. I used argparse because this is going to be a command line tool, and is useful to have obligatory arguments and default values for the others.

I am inexperienced in coding and have noone to review my code. I feel that needing to call each argument for each function is really awkward.

My questions:

Is this the best structure? Is creating a “chain” of functions like this the Best practice?

Code

import argparse
import collections
import csv


def cleaning(file_as_list, snp, names):
    """From input file get the SNPS."""
    with open(file_as_list, 'r') as input_file:
        reader = csv.reader(input_file)
        file = list(reader)
    have_SNP = [x for x in file if x[snp] == '1']
    for i in range(len(have_SNP)):
        mult_names = have_SNP[i][names].replace(':', ',').replace(', ', ',')
        sep_names = mult_names.split(',')
        only_names = [x for x in sep_names if ' ' not in x]
        have_SNP[i][names] = only_names
    return have_SNP


def reference_dic(file_as_list, snp, names, col_ref, pos):
    """Creates the dict with all positions and reference nucleotides."""
    have_SNP = cleaning(file_as_list, snp, names)
    ref_dic = {}
    for i in have_SNP:
        ref_dic[int(i[pos].replace(',', ''))] = i[col_ref]
    return ref_dic


def pos_list(file_as_list, snp, names, col_ref, pos):
    """Creates a list with all the ehxisting positions in reference."""
    ref_dic = reference_dic(file_as_list, snp, names, col_ref, pos)
    list_pos = []
    for key in ref_dic:
        list_pos.append(key)
    sorted_pos_lis = sorted(list_pos)
    return sorted_pos_lis


def genomes_list(file_as_list, snp, names, col_ref, pos):
    """Identifies the genomes present in the input file."""
    have_SNP = cleaning(file_as_list, snp, names)
    genomes_dic = {}
    for i in have_SNP:
        for j in i[names]:
            genomes_dic[j] = ""
    genomes_list = []
    for key in genomes_dic:
        genomes_list.append(key)
    return genomes_list


def identify_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Creates a list of tuples with genome name and respesctive SNPs."""
    have_SNP = cleaning(file_as_list, snp, names)
    genomes = genomes_list(file_as_list, snp, names, col_ref, pos)
    entrys_per_genome = []
    pos_genomes_in_dict = []
    for i in genomes:
        sub_tup = ()
        sub_list = []
        sub_dict = {}
        for j in have_SNP:
            if i in j[names]:
                sub_sub_list = [int(j[pos].replace(',', '')), j[col_genome]]
                sub_list.append(sub_sub_list)
                sub_dict[int(j[pos].replace(',', ''))] = j[col_genome]
        sub_tup = (i, sub_list)
        sub_dic_tup = (i, sub_dict)
        entrys_per_genome.append(sub_tup)
        pos_genomes_in_dict.append(sub_dic_tup)
    return entrys_per_genome, pos_genomes_in_dict


def remove_dupli_pos(file_as_list, snp, names, col_ref, pos, col_genome):
    """Creates a list without SNPs that appear 2 times for one genome."""
    entrys_per_genome = identify_genomes(file_as_list, snp, names, col_ref,
                                        pos, col_genome)[0]
    all_genomes_pos = []
    for i in entrys_per_genome:
        genome_pos = []
        for j in i[1]:
            genome_pos.append(j[0])
        all_genomes_pos.append(genome_pos)
    list_dup_pos = []
    for i in all_genomes_pos:
        duplicated = [k for k, v in collections.Counter(i).items() if v > 1]
        list_dup_pos.extend(duplicated)
    no_dup_list_dup_pos = set(list_dup_pos)
    all_positions = pos_list(file_as_list, snp, names, col_ref, pos)
    pos_no_dup = [x for x in all_positions if x not in no_dup_list_dup_pos]
    return pos_no_dup


def get_ref(file_as_list, snp, names, col_ref, pos, col_genome):
    """Creates the reference sequence based on all SNPs."""
    ref_dic = reference_dic(file_as_list, snp, names, col_ref, pos)
    pos_no_dup = remove_dupli_pos(file_as_list, snp, names, col_ref,
                                  pos, col_genome)
    reference_snps_list = ""
    for i in pos_no_dup:
        reference_snps_list += str(ref_dic[i])
    return reference_snps_list


def get_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Uses the SNPs for each genome and 'N's to build each genome sequence."""
    ref_dic = reference_dic(file_as_list, snp, names, col_ref, pos)
    pos_no_dup = remove_dupli_pos(file_as_list, snp, names, col_ref, pos,
                                  col_genome)
    genomes_pos = identify_genomes(file_as_list, snp, names, col_ref, pos,
                                  col_genome)[1]
    genomes = []
    for i in genomes_pos:
        dic_of_genome = i[1]
        this_genome = ""
        for j in pos_no_dup:
            if j in dic_of_genome.keys():
                this_genome += str(dic_of_genome[j])
            elif j in ref_dic:
                this_genome += 'N'
            else:
                print("ERROR!!!!")
                break
        genomes.append(">{0}".format(i[0]))
        genomes.append(this_genome)
    return genomes


def main(file_as_list, snp, names, col_ref, pos, col_genome):
    """Creates 'files.fasta' with the ref and genomes in fasta format."""
    ref_genome = get_ref(file_as_list, snp, names, col_ref, pos, col_genome)
    genomes = get_genomes(file_as_list, snp, names, col_ref, pos, col_genome)
    with open("files.fasta", "w") as out_file:
        out_file.write(">reference_sequencen")
        out_file.write("{0}n".format(ref_genome))
        for i in genomes:
            out_file.write("{0}n".format(i))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("input",
                        help="name of the input file")
    parser.add_argument("-r", "--col_ref_genome_nuc", default=2,
                        help="""number of the column with the reference genome
                        nucleotides""")
    parser.add_argument("-g", "--col_genomes_nuc", default=8,
                        help="""number of the column with the genomes
                        nucleotides""")
    parser.add_argument("-p", "--position", default=3,
                        help="""number of the column with the position in the
                        genome""")
    parser.add_argument("-n", "--genome_names", default=10,
                        help="number of the column with the genomes names")
    parser.add_argument("-s", "--is_snp", default=7,
                        help="number of the column with lenght")
    args = parser.parse_args()
    print("""Columns:n[Reference genome:{0}]n[Genomes:{1}]
[Position of the SNP:{2}]n[Genomes name:{3}]
[Is SNP:{4}]""" .format(args.col_ref_genome_nuc, args.col_genomes_nuc,
                        args.position, args.genome_names, args.is_snp))
    col_ref = int(args.col_ref_genome_nuc) - 1
    col_genome = int(args.col_genomes_nuc) - 1
    pos = int(args.position) - 1
    names = int(args.genome_names) - 1
    snp = int(args.is_snp) - 1
    file_as_list = str(args.input)

    print("nProcessing...")
    main(file_as_list, snp, names, col_ref, pos, col_genome)
    print("nJob Done. Output written as <files.fasta>")

Solution

This sounds like a good use case to use a class where the arguments you currently pass through the chain of functions would be class attributes, for instance, if we would have a single Converter class, we may have it initialized this way:

class Converter:
    def __init__(self, filename, snp, names, col_ref, pos, col_genome):
        self.filename = filename
        self.snp = snp
        self.names = names
        self.col_ref = col_ref
        self.pos = pos
        self.col_genome = col_genome

Then, your functions will become instance methods where you’ll access the instance attributes via self.<attribute> instead of taking an argument.

Think of a class as a way to group related things logically, providing a shared access to the common variables and methods.

There are also some other things to improve:

  • instead of converting your arguments to int, you can define them with type=int
  • you can make use of dictionary and list comprehensions in multiple places
  • you can make use of str.join() – for example, when defining reference_snps_list:

    reference_snps_list = "".join(str(ref_dic[i]) for i in pos_no_dup)
    
  • you can use the special argparse.FileType for the input file argument

FYI, since this is a controversial topic in general:

Leave a Reply

Your email address will not be published. Required fields are marked *