Source code for transposonmapper.properties.list_gene_names


import os
import pkg_resources

def list_gene_names(gene_information_file=None):
    """Return every gene name found in a yeast protein-name table, with aliases.

    The input is read as the fixed-layout text table distributed at
    https://www.uniprot.org/docs/yeast: the first 58 lines and the last
    6 lines are treated as header/footer and ignored, and every line in
    between is parsed as one gene entry.

    Parameters
    ----------
    gene_information_file : str, optional
        Path to a file in the layout described above. By default None, in
        which case ``Yeast_Protein_Names.txt`` from the ``data_files``
        directory of the installed ``transposonmapper`` package is used.

    Returns
    -------
    list
        Flat list of names. For each gene line, in file order, it holds the
        first designation, then the OLN (ordered locus name), then any
        remaining aliases. The placeholder tokens ``GAG`` and ``POL`` are
        never included.

    Raises
    ------
    IndexError
        If a non-blank line in the gene section has fewer whitespace
        separated columns than its number of ``;`` characters implies.

    Notes
    -----
    The number of gene lines processed is printed to stdout.
    """
    if gene_information_file is None:
        default_path = pkg_resources.resource_filename(
            "transposonmapper", "data_files/"
        )
        gene_information_file = os.path.join(default_path, "Yeast_Protein_Names.txt")

    header_lines = 58  # gene entries start after this many lines
    footer_lines = 6  # ... and stop this many lines before the end of the file
    placeholder_tokens = ("GAG", "POL")  # column fillers that are not gene names

    with open(gene_information_file) as f:
        lines = f.readlines()

    # Clamp the end so that files shorter than header + footer yield no entries.
    last_gene_line = max(header_lines, len(lines) - footer_lines)

    gene_name_list = []  # all designations, OLNs and aliases, in file order
    gene_counter = 0
    for line in lines[header_lines:last_gene_line]:
        fields = line.split()
        if not fields:
            # Whitespace-only line: nothing to parse (indexing it would fail).
            continue

        # Designations are separated by ';', so the number of ';' tells how
        # many alias columns sit between the first designation and the OLN.
        alias_count = line.count(";")
        oln_index = 1 + alias_count

        gene_name_list.append(fields[0].strip(";"))

        gene_oln = fields[oln_index].strip(";")
        if gene_oln in placeholder_tokens:
            # A 'GAG'/'POL' token occupies the OLN position; the real OLN is
            # in the next column.
            gene_oln = fields[oln_index + 1].strip(";")
        gene_name_list.append(gene_oln)

        for alias_field in fields[1:oln_index]:
            alias = alias_field.strip(";")
            if alias not in placeholder_tokens:
                gene_name_list.append(alias)

        gene_counter += 1

    print("Number of genes found in file = ", gene_counter)
    return gene_name_list
SATAY pipeline at Delft :)

Source code for transposonmapper.properties.list_gene_names