import os
import pkg_resources
def list_gene_names(gene_information_file=None):
    """Create a list of all known gene names, OLN names and aliases.

    The input is expected to be the standard file downloaded from
    https://www.uniprot.org/docs/yeast. Each data line starts with one or
    more gene designations separated by ``;`` followed by the OLN
    (ordered locus name) column.

    Parameters
    ----------
    gene_information_file : str, optional
        Path to the protein names file. By default None, in which case the
        ``Yeast_Protein_Names.txt`` file shipped in the ``data_files``
        directory of the ``transposonmapper`` package is used.

    Returns
    -------
    list
        For every data line, in file order: the first designation, the OLN
        name, and then any aliases (if they exist).

    Raises
    ------
    IndexError
        If a non-blank data line has fewer columns than its number of ``;``
        separators implies.

    Notes
    -----
    The number of processed data lines is printed to standard output.
    """
    if gene_information_file is None:
        default_path = pkg_resources.resource_filename("transposonmapper", "data_files/")
        gene_information_file = os.path.join(
            default_path, "Yeast_Protein_Names.txt"
        )

    # The gene table starts at line index 58 and stops 6 lines before the end
    # of the file; everything outside that window is header/footer text.
    header_lines = 58
    footer_lines = 6

    # 'GAG' and 'POL' are tokens that sometimes sit between the designations
    # and the OLN column; they are never reported as gene names.
    ignored_tokens = ("GAG", "POL")

    with open(gene_information_file) as f:
        lines = f.readlines()

    gene_name_list = []  # all designations, OLN names and aliases
    gene_counter = 0

    for line in lines[header_lines : len(lines) - footer_lines]:
        columns = line.split()
        if not columns:
            # A blank line carries no gene; skip it instead of failing on
            # the column lookups below.
            continue

        # Every ';' in the line marks one extra alias column that precedes
        # the OLN column, so the OLN index shifts by that amount.
        alias_count = line.count(";")
        oln_index = 1 + alias_count

        gene_name_list.append(columns[0].strip(";"))

        gene_oln = columns[oln_index].strip(";")
        if gene_oln in ignored_tokens:
            # The OLN slot holds 'GAG'/'POL': the real OLN is the next column.
            gene_oln = columns[oln_index + 1].strip(";")
        gene_name_list.append(gene_oln)

        # Aliases are the columns between the first designation and the OLN.
        for column in columns[1:oln_index]:
            alias = column.strip(";")
            if alias not in ignored_tokens:
                gene_name_list.append(alias)

        gene_counter += 1

    print("Number of genes found in file = ", gene_counter)
    return gene_name_list