import numpy as np
import timeit
from transposonmapper.mapping.find_chromosome_reads import find_chromosome_reads
from transposonmapper.mapping.correct_read_position import correct_read_position
from transposonmapper.properties import (
get_chromosome_names,
get_sequence_length,
get_chromosome_reads,
)
[docs]def get_reads(bam):
"""This function retrieves all reads within a specified genomic region.
Parameters
----------
bam : The output for the function pysam.AlignmentFile(bamfile, "rb")
Returns
-------
numpy.array
reads per genomic region
numpy.array
Array of three columns where the 2nd one indicated the start position where there was a transposon
numpy.array
A copy from the 2nd output
"""
# Get chromosome properties
ref_tid = get_chromosome_names(bam)
ref_names = list(ref_tid.keys())
chr_lengths, _ = get_sequence_length(bam)
chr_mapped_reads = get_chromosome_reads(bam)
### GET ALL READS WITHIN A SPECIFIED GENOMIC REGION
tnnumber_dict = {}
unique_insertions = 0
for chr_num in ref_names:
timer_start = timeit.default_timer()
print("Getting reads for chromosome %s ..." % chr_num)
chromosome = bam.fetch(chr_num, 0, chr_lengths[chr_num], until_eof=True)
N_reads = chr_mapped_reads[chr_num][2]
[flags, start_position, readlength] = find_chromosome_reads(chromosome, N_reads)
start_position_corrected, flags_corrected = correct_read_position(
flags, start_position, readlength
)
# CREATE ARRAY OF START POSITION AND FLAGS OF ALL READS IN GENOME
ref_tid_kk = int(ref_tid[chr_num] + 1)
if unique_insertions == 0:
tncoordinates_array = np.array([])
unique_insertions_per_read = 0 # Number of unique reads per insertion
num_transposons = 1 # Number of unique reads in current chromosome (Number of transposons in current chromosome)
for ii in range(1, len(start_position_corrected)):
if (
abs(start_position_corrected[ii] - start_position_corrected[ii - 1])
<= 2
and flags_corrected[ii] == flags_corrected[ii - 1]
): # If two subsequent reads are within two basepairs and have the same orientation, add them together.
unique_insertions_per_read += 1
else:
avg_start_pos = abs(
round(
np.mean(
start_position_corrected[
ii - unique_insertions_per_read - 1 : ii
]
)
)
)
if tncoordinates_array.size == 0: # include first read
tncoordinates_array = np.array(
[ref_tid_kk, int(avg_start_pos), int(flags_corrected[ii - 1])]
)
readnumb_list = [unique_insertions_per_read + 1]
else:
tncoordinates_array = np.vstack(
(
tncoordinates_array,
[
ref_tid_kk,
int(avg_start_pos),
int(flags_corrected[ii - 1]),
],
)
)
readnumb_list.append(unique_insertions_per_read + 1)
unique_insertions_per_read = 0
num_transposons += 1
unique_insertions += 1
if ii == len(start_position_corrected) - 1: # include last read
avg_start_pos = abs(
round(
np.mean(
start_position_corrected[
ii - unique_insertions_per_read - 1 : ii
]
)
)
)
tncoordinates_array = np.vstack(
(
tncoordinates_array,
[ref_tid_kk, int(avg_start_pos), int(flags_corrected[ii - 1])],
)
)
readnumb_list.append(unique_insertions_per_read + 1)
tnnumber_dict[chromosome] = num_transposons
timer_end = timeit.default_timer()
print(
"Chromosome %s completed in %.3f seconds"
% (chromosome, (timer_end - timer_start))
)
print("")
readnumb_array = np.array(readnumb_list)
tncoordinatescopy_array = np.array(tncoordinates_array, copy=True)
return readnumb_array, tncoordinates_array, tncoordinatescopy_array