Source code for transposonmapper.processing.read_sgdfeatures

# -*- coding: utf-8 -*-

from transposonmapper.importing import load_sgd_tab
from transposonmapper.utils   import  chromosomename_roman_to_arabic

def sgd_features(filepath=None):
    """Read SGD_features.tab and group its rows into one dictionary per feature type.

    Each non-blank line of the file is split on tabs. Rows whose chromosome
    column (column 8) is empty or ends with ``'micron'`` are ignored, as are
    rows whose feature type (column 1) is not one of the twenty types listed
    below. Every remaining row is stored in the dictionary belonging to its
    feature type; a later row with the same key overwrites an earlier one.

    Parameters
    ----------
    filepath : str, optional
        Path of the SGD_features.tab file. When None (the default) the path
        returned by ``load_sgd_tab()`` is used.

    Returns
    -------
    tuple
        A 21-element tuple ``(genomicregions_list, feature_orf_dict, ...)``:

        0. ``genomicregions_list``: a fixed list of twenty region labels
           (note that some labels are capitalised differently from the
           feature types matched in the file, e.g. ``'Telomere'``).
        1-20. One dictionary per feature type, in this order: ORF, ARS,
           telomere, long_terminal_repeat, centromere, X_element, intron,
           ncRNA_gene, noncoding_exon, tRNA_gene, snoRNA_gene,
           transposable_element_gene, five_prime_UTR_intron,
           matrix_attachment_site, snRNA_gene, rRNA_gene,
           external_transcribed_spacer_region,
           internal_transcribed_spacer_region, origin_of_replication,
           telomerase_RNA_gene.

        Dictionary keys are taken from column 3 (feature name), except for
        intron, noncoding_exon, five_prime_UTR_intron,
        external_transcribed_spacer_region and
        internal_transcribed_spacer_region, which are keyed by column 6
        (parent feature name).

        Every dictionary value is a list of eight items:

        0. feature type (column 1)
        1. feature qualifier, e.g. Verified or Dubious (column 2)
        2. standard name (column 4)
        3. aliases, separated by '|' (column 5)
        4. parent feature name, typically 'chromosome ...' (column 6)
        5. chromosome, as translated by the lookup table obtained from
           ``chromosomename_roman_to_arabic()[0]`` (None when the number is
           not in that table)
        6. start coordinate, as the string found in column 9
        7. end coordinate, as the string found in column 10

    Raises
    ------
    IndexError
        If a non-blank line has too few tab-separated columns.
    ValueError
        If the chromosome column is neither empty, nor ending in 'micron',
        nor convertible with ``int()``.

    Notes
    -----
    The SGD_features file is distributed at
    http://sgd-archive.yeastgenome.org/curation/chromosomal_feature/
    """
    if filepath is None:
        filepath = load_sgd_tab()

    # The table is queried with integer chromosome numbers below; presumably it
    # maps arabic numbers to roman numerals -- confirm against
    # transposonmapper.utils.chromosomename_roman_to_arabic.
    chromosome_lookup = chromosomename_roman_to_arabic()[0]

    # (feature type as written in column 1, column that supplies the dict key).
    # The order of this table is also the order of the dictionaries in the
    # returned tuple, so it must not be rearranged.
    feature_layout = (
        ('ORF', 3),
        ('ARS', 3),
        ('telomere', 3),
        ('long_terminal_repeat', 3),
        ('centromere', 3),
        ('X_element', 3),
        ('intron', 6),
        ('ncRNA_gene', 3),
        ('noncoding_exon', 6),
        ('tRNA_gene', 3),
        ('snoRNA_gene', 3),
        ('transposable_element_gene', 3),
        ('five_prime_UTR_intron', 6),
        ('matrix_attachment_site', 3),
        ('snRNA_gene', 3),
        ('rRNA_gene', 3),
        ('external_transcribed_spacer_region', 6),
        ('internal_transcribed_spacer_region', 6),
        ('origin_of_replication', 3),
        ('telomerase_RNA_gene', 3),
    )
    key_column = dict(feature_layout)
    features_by_type = {feature_type: {} for feature_type, _ in feature_layout}

    with open(filepath) as f:
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no record; skip it instead of failing on a missing column.
            if not line.strip():
                continue

            fields = line.strip('\n').split('\t')
            feature_type = fields[1]
            chromosome_field = fields[8]

            # Rows without a chromosome number or located on the 2-micron
            # plasmid are not stored.
            if chromosome_field == '' or chromosome_field.endswith('micron'):
                continue

            chromosome = chromosome_lookup.get(int(chromosome_field))

            if feature_type not in features_by_type:
                continue

            record_key = fields[key_column[feature_type]]
            features_by_type[feature_type][record_key] = [
                feature_type,
                fields[2],
                fields[4],
                fields[5],
                fields[6],
                chromosome,
                fields[9],
                fields[10],
            ]

    # Labels are reproduced exactly as callers have always received them.
    genomicregions_list = [
        'ORF',
        'ARS',
        'Telomere',
        'long_terminal_repeat',
        'Centromere',
        'X_element',
        'Intron',
        'ncRNA_gene',
        'Noncoding_exon',
        'tRNA_gene',
        'snoRNA_gene',
        'transposable_element_gene',
        'five_prime_UTR_intron',
        'matrix_attachment_site',
        'snRNA_gene',
        'rRNA_gene',
        'external_transcribed_spacer_region',
        'internal_transcribed_spacer_region',
        'origin_of_replication',
        'telomerase_RNA_gene',
    ]

    return (genomicregions_list,) + tuple(
        features_by_type[feature_type] for feature_type, _ in feature_layout
    )