From 0c814fe1524efc71704b30928c4609447b47ccbf Mon Sep 17 00:00:00 2001 From: Smetanin Aleksandr Date: Mon, 20 Jul 2020 13:26:19 +0300 Subject: [PATCH 1/6] pass alignment to check function --- src/alignment_processor.py | 6 +++--- src/junction_comparator.py | 44 ++++++++++++++++++++++++++++++-------- src/long_read_assigner.py | 8 ++++--- src/long_read_profiles.py | 2 +- src/read_mapper.py | 2 +- 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/src/alignment_processor.py b/src/alignment_processor.py index bbb8e23c..09f17a15 100644 --- a/src/alignment_processor.py +++ b/src/alignment_processor.py @@ -109,7 +109,8 @@ def process_single_file(self, bam): exon_profile = self.exon_profile_constructor.construct_exon_profile(sorted_blocks) split_exon_profile = self.split_exon_profile_constructor.construct_profile(sorted_blocks) combined_profile = CombinedReadProfiles(intron_profile, exon_profile, split_exon_profile, - polya_pos=polya_pos, polyt_pos=polyt_pos, cage_hits=cage_hits) + polya_pos=polya_pos, polyt_pos=polyt_pos, + cage_hits=cage_hits, alignment=alignment) read_assignment = self.assigner.assign_to_isoform(read_id, combined_profile) read_assignment.polyA_found = (polya_pos != -1 or polyt_pos != -1) @@ -132,8 +133,7 @@ def count_indel_stats(self, alignment): indel_events = [1, 2] indel_count = 0 intron_cigar_positions = [] - for i in range(cigar_event_count): - cigar = alignment.cigartuples[i] + for i, cigar in enumerate(alignment.cigartuples): if cigar[0] in indel_events: indel_count += 1 elif cigar[0] == 3: diff --git a/src/junction_comparator.py b/src/junction_comparator.py index 1015d893..053960ed 100644 --- a/src/junction_comparator.py +++ b/src/junction_comparator.py @@ -7,6 +7,9 @@ import logging from collections import namedtuple +from Bio.pairwise2 import align +from Bio.Seq import Seq + from src.isoform_assignment import * from src.gene_info import * from src.long_read_profiles import * @@ -15,14 +18,14 @@ logger = logging.getLogger('IsoQuant') -class JunctionComparator(): +class JunctionComparator: absent = -10 def __init__(self, params, intron_profile_constructor): self.params = params self.intron_profile_constructor = intron_profile_constructor - def compare_junctions(self, read_junctions, read_region, isoform_junctions, isoform_region): + def compare_junctions(self, read_junctions, read_region, isoform_junctions, isoform_region, alignment=None): """ compare read splice junctions against similar isoform Parameters @@ -128,7 +131,8 @@ def compare_junctions(self, read_junctions, read_region, isoform_junctions, isof if any(el == -1 for el in read_features_present) or any(el == -1 for el in isoform_features_present): # classify contradictions logger.debug("+ + Classifying contradictions") - matching_events = self.detect_contradiction_type(read_junctions, isoform_junctions, contradictory_region_pairs) + matching_events = self.detect_contradiction_type(read_junctions, isoform_junctions, + contradictory_region_pairs, alignment) if read_features_present[0] == 0 or read_features_present[-1] == 0: if all(x == 0 for x in read_features_present): @@ -141,7 +145,7 @@ def compare_junctions(self, read_junctions, read_region, isoform_junctions, isof return [make_event(MatchEventSubtype.none)] return matching_events - def detect_contradiction_type(self, read_junctions, isoform_junctions, contradictory_region_pairs): + def detect_contradiction_type(self, read_junctions, isoform_junctions, contradictory_region_pairs, alignment): """ Parameters @@ -155,14 +159,16 @@ def detect_contradiction_type(self, read_junctions, isoform_junctions, contradic list of contradiction events """ contradiction_events = [] - for pair in contradictory_region_pairs: + for read_cregion, isoform_cregion in contradictory_region_pairs: # classify each contradictory area separately - event = self.compare_overlapping_contradictional_regions(read_junctions, isoform_junctions, pair[0], pair[1]) + event = self.compare_overlapping_contradictional_regions(read_junctions, isoform_junctions, + read_cregion, isoform_cregion, alignment) contradiction_events.append(event) return contradiction_events - def compare_overlapping_contradictional_regions(self, read_junctions, isoform_junctions, read_cregion, isoform_cregion): + def compare_overlapping_contradictional_regions(self, read_junctions, isoform_junctions, read_cregion, + isoform_cregion, alignment): if read_cregion[0] == self.absent: return make_event(MatchEventSubtype.intron_retention, isoform_cregion[0], read_cregion) elif isoform_cregion[0] == self.absent: @@ -192,7 +198,7 @@ def compare_overlapping_contradictional_regions(self, read_junctions, isoform_ju elif read_cregion[1] == read_cregion[0] and isoform_cregion[1] > isoform_cregion[0]: event = self.classify_skipped_exons(isoform_junctions, isoform_cregion, - total_intron_len_diff, read_introns_known) + total_intron_len_diff, read_introns_known, alignment=alignment) elif read_cregion[1] > read_cregion[0] and isoform_cregion[1] == isoform_cregion[0]: if read_introns_known: @@ -208,11 +214,14 @@ def compare_overlapping_contradictional_regions(self, read_junctions, isoform_ju return make_event(event, isoform_cregion[0], read_cregion) def classify_skipped_exons(self, isoform_junctions, isoform_cregion, - total_intron_len_diff, read_introns_known): + total_intron_len_diff, read_introns_known, alignment=None): total_exon_len = sum([isoform_junctions[i + 1][0] - isoform_junctions[i][1] + 1 for i in range(isoform_cregion[0], isoform_cregion[1])]) if total_intron_len_diff < 2 * self.params.delta and total_exon_len <= self.params.max_missed_exon_len: + print('exon_misaln') + check_misalignment(isoform_junctions, isoform_cregion, + total_intron_len_diff, read_introns_known, alignment) event = MatchEventSubtype.exon_misallignment else: if read_introns_known: @@ -223,6 +232,7 @@ def classify_skipped_exons(self, isoform_junctions, isoform_cregion, def classify_single_intron_alternation(self, read_junctions, isoform_junctions, read_cpos, isoform_cpos, total_intron_len_diff, read_introns_known): + # print('sngl intr', read_junctions, isoform_junctions, read_cpos, isoform_cpos, total_intron_len_diff, read_introns_known) if total_intron_len_diff <= 2 * self.params.delta: if read_introns_known: event = MatchEventSubtype.intron_migration @@ -292,3 +302,19 @@ def profile_for_junctions_introns(self, junctions, region): def are_known_introns(self, junctions, region): selected_junctions_profile = self.profile_for_junctions_introns(junctions, region) return all(el == 1 for el in selected_junctions_profile.read_profile) + + +def check_misalignment(isoform_junctions, isoform_cregion, total_intron_len_diff, read_introns_known, alignment=None): + if alignment is None: + return + + print('iso_junc ', isoform_junctions) + print('iso_creg ', isoform_cregion) + print('total_dif ', total_intron_len_diff) + print('read_intr ', read_introns_known) + ref_seq = Seq(alignment.get_reference_sequence()) + seq = Seq(alignment.get_forward_sequence()) + aligned = align.globalxx(seq, ref_seq) + print('Aligned ', aligned) + + diff --git a/src/long_read_assigner.py b/src/long_read_assigner.py index 6fbb37bb..b48159b5 100644 --- a/src/long_read_assigner.py +++ b/src/long_read_assigner.py @@ -508,10 +508,11 @@ def match_contradictory(self, read_id, combined_read_profile): self.gene_info.split_exon_profiles.profiles, hint=overlapping_isoforms) return self.detect_differences(read_id, read_intron_profile, read_split_exon_profile, - intron_matching_isoforms, exon_matching_isoforms) + intron_matching_isoforms, exon_matching_isoforms, + alignment=combined_read_profile.alignment) def detect_differences(self, read_id, read_intron_profile, read_split_exon_profile, - intron_matching_isoforms, exon_matching_isoforms): + intron_matching_isoforms, exon_matching_isoforms, alignment=None): """ compare read to closest matching isoforms Parameters @@ -521,6 +522,7 @@ def detect_differences(self, read_id, read_intron_profile, read_split_exon_profi read_split_exon_profile: MappedReadProfile intron_matching_isoforms: list of IsoformDiff exon_matching_isoforms: list of IsoformDiff + alignment: pysam AlignedSegment Returns ------- @@ -555,7 +557,7 @@ def detect_differences(self, read_id, read_intron_profile, read_split_exon_profi isoform_region = (isoform_start, isoform_end) matching_events = self.intron_comparator.compare_junctions(read_intron_profile.read_features, read_region, - isoform_introns, isoform_region) + isoform_introns, isoform_region, alignment) if len(matching_events) == 1 and matching_events[0].event_type == MatchEventSubtype.undefined: continue match_classification = MatchClassification.get_contradiction_classification_from_subtypes(matching_events) diff --git a/src/long_read_profiles.py b/src/long_read_profiles.py index 8007c06c..26274fea 100644 --- a/src/long_read_profiles.py +++ b/src/long_read_profiles.py @@ -36,7 +36,7 @@ def __init__(self, read_intron_profile, read_exon_profile, read_split_exon_profi # accepts sorted gapless alignment blocks class OverlappingFeaturesProfileConstructor: # ignore_terminal -- bool flag, indicates whether to ignore leading and trailing -1s in the profile - def __init__(self, known_features, gene_region, comparator = partial(equal_ranges, delta=0), delta=0): + def __init__(self, known_features, gene_region, comparator=partial(equal_ranges, delta=0), delta=0): self.known_features = known_features self.gene_region = gene_region self.comparator = comparator diff --git a/src/read_mapper.py b/src/read_mapper.py index 61345a4a..54c80bbb 100644 --- a/src/read_mapper.py +++ b/src/read_mapper.py @@ -226,7 +226,7 @@ def align_fasta(aligner, fastq_paths, args, label, out_dir): elif aligner == "minimap2": minimap2_path = get_aligner('minimap2') - additional_options = [] + additional_options = ['--MD'] if args.stranded == 'forward': additional_options.append('-uf') # TODO: add junction bed From 53e21258ebfa3a450a32379b433f6b76e294f101 Mon Sep 17 00:00:00 2001 From: Smetanin Aleksandr Date: Mon, 27 Jul 2020 13:45:03 +0300 Subject: [PATCH 2/6] add intron misalignment detection --- src/junction_comparator.py | 200 ++++++++++++++++++++++++++++++------- 1 file changed, 164 insertions(+), 36 deletions(-) diff --git a/src/junction_comparator.py b/src/junction_comparator.py index 053960ed..441501e6 100644 --- a/src/junction_comparator.py +++ b/src/junction_comparator.py @@ -7,8 +7,8 @@ import logging from collections import namedtuple -from Bio.pairwise2 import align -from Bio.Seq import Seq +import pysam +from Bio import pairwise2 from src.isoform_assignment import * from src.gene_info import * @@ -16,6 +16,7 @@ logger = logging.getLogger('IsoQuant') +pairwise2.MAX_ALIGNMENTS = 1 class JunctionComparator: @@ -24,6 +25,8 @@ class JunctionComparator: def __init__(self, params, intron_profile_constructor): self.params = params self.intron_profile_constructor = intron_profile_constructor + self.misalignment_checker = MisalignmentChecker(params.reference, params.delta, params.max_intron_shift) + # self.exon_misalignment_checker = ExonMisalignmentChecker(params.reference) def compare_junctions(self, read_junctions, read_region, isoform_junctions, isoform_region, alignment=None): """ compare read splice junctions against similar isoform @@ -168,7 +171,7 @@ def detect_contradiction_type(self, read_junctions, isoform_junctions, contradic return contradiction_events def compare_overlapping_contradictional_regions(self, read_junctions, isoform_junctions, read_cregion, - isoform_cregion, alignment): + isoform_cregion, alignment=None): if read_cregion[0] == self.absent: return make_event(MatchEventSubtype.intron_retention, isoform_cregion[0], read_cregion) elif isoform_cregion[0] == self.absent: @@ -186,8 +189,9 @@ def compare_overlapping_contradictional_regions(self, read_junctions, isoform_ju read_introns_known = self.are_known_introns(read_junctions, read_cregion) if read_cregion[1] == read_cregion[0] and isoform_cregion[1] == isoform_cregion[0]: - event = self.classify_single_intron_alternation(read_junctions, isoform_junctions, read_cregion[0], - isoform_cregion[0], total_intron_len_diff, read_introns_known) + event = self.classify_single_intron_alternation( + read_junctions, isoform_junctions, read_cregion[0], isoform_cregion[0], + total_intron_len_diff, read_introns_known, alignment) elif read_cregion[1] - read_cregion[0] == isoform_cregion[1] - isoform_cregion[0] and \ total_intron_len_diff < self.params.delta: @@ -219,40 +223,31 @@ def classify_skipped_exons(self, isoform_junctions, isoform_cregion, for i in range(isoform_cregion[0], isoform_cregion[1])]) if total_intron_len_diff < 2 * self.params.delta and total_exon_len <= self.params.max_missed_exon_len: - print('exon_misaln') - check_misalignment(isoform_junctions, isoform_cregion, - total_intron_len_diff, read_introns_known, alignment) - event = MatchEventSubtype.exon_misallignment + return MatchEventSubtype.exon_misallignment + elif read_introns_known: + return MatchEventSubtype.exon_skipping_known_intron else: - if read_introns_known: - event = MatchEventSubtype.exon_skipping_known_intron - else: - event = MatchEventSubtype.exon_skipping_novel_intron - return event + return MatchEventSubtype.exon_skipping_novel_intron def classify_single_intron_alternation(self, read_junctions, isoform_junctions, read_cpos, isoform_cpos, - total_intron_len_diff, read_introns_known): - # print('sngl intr', read_junctions, isoform_junctions, read_cpos, isoform_cpos, total_intron_len_diff, read_introns_known) + total_intron_len_diff, read_introns_known, alignment=None): if total_intron_len_diff <= 2 * self.params.delta: if read_introns_known: - event = MatchEventSubtype.intron_migration + return MatchEventSubtype.intron_migration else: - if abs(isoform_junctions[isoform_cpos][0] - read_junctions[read_cpos][0]) <= self.params.max_intron_shift: - event = MatchEventSubtype.intron_shift - else: - event = MatchEventSubtype.intron_alternation_novel + return self.misalignment_checker.intron_shift( + read_junctions, isoform_junctions, read_cpos, isoform_cpos, alignment) else: # TODO correct when strand is negative if abs(isoform_junctions[isoform_cpos][0] - read_junctions[read_cpos][0]) < self.params.delta: - event = MatchEventSubtype.alt_acceptor_site_known if read_introns_known \ + return MatchEventSubtype.alt_acceptor_site_known if read_introns_known \ else MatchEventSubtype.alt_acceptor_site_novel elif abs(isoform_junctions[isoform_cpos][1] - read_junctions[read_cpos][1]) < self.params.delta: - event = MatchEventSubtype.alt_donor_site_known if read_introns_known \ + return MatchEventSubtype.alt_donor_site_known if read_introns_known \ else MatchEventSubtype.alt_donor_site_novel else: - event = MatchEventSubtype.intron_alternation_known if read_introns_known \ + return MatchEventSubtype.intron_alternation_known if read_introns_known \ else MatchEventSubtype.intron_alternation_novel - return event def get_mono_exon_subtype(self, read_region, isoform_junctions): if len(isoform_junctions) == 0: @@ -304,17 +299,150 @@ def are_known_introns(self, junctions, region): return all(el == 1 for el in selected_junctions_profile.read_profile) -def check_misalignment(isoform_junctions, isoform_cregion, total_intron_len_diff, read_introns_known, alignment=None): - if alignment is None: - return +class MisalignmentChecker: + def __init__(self, reference, delta, max_intron_shift): + if reference is not None: + if not os.path.exists(reference + '.fai'): + logger.info('Building fasta index with samtools') + pysam.faidx(reference) + reference = pysam.FastaFile(reference) + self.reference = reference + self.delta = delta + self.max_intron_shift = max_intron_shift + self.scores = (2, -1, -1, -.2) + + def intron_shift(self, read_junctions, isoform_junctions, read_cpos, isoform_cpos, alignment=None): - print('iso_junc ', isoform_junctions) - print('iso_creg ', isoform_cregion) - print('total_dif ', total_intron_len_diff) - print('read_intr ', read_introns_known) - ref_seq = Seq(alignment.get_reference_sequence()) - seq = Seq(alignment.get_forward_sequence()) - aligned = align.globalxx(seq, ref_seq) - print('Aligned ', aligned) + read_left, read_right = read_junctions[read_cpos] + iso_left, iso_right = isoform_junctions[isoform_cpos] + if abs(read_left - iso_left) + abs(read_right - iso_right) <= 2 * self.delta: + return MatchEventSubtype.intron_shift + + if alignment is None or self.reference is None: + if abs(read_left - iso_left) <= self.max_intron_shift: + return MatchEventSubtype.intron_shift + else: + return MatchEventSubtype.intron_alternation_novel + + read_region, ref_region = self.get_aligned_regions(read_left, read_right, iso_left, iso_right) + seq = self.get_read_sequence(alignment, read_region) + ref_seq = self.reference.fetch(alignment.reference_name, *ref_region) + + if self.sequences_match(seq, ref_seq): + return MatchEventSubtype.intron_shift + return MatchEventSubtype.intron_alternation_novel + + def sequences_match(self, seq, ref_seq): + score, size = 0, 1 + for a in pairwise2.align.globalms(seq, ref_seq, *self.scores): + score, size = a[2], a[4] + if score > 0.7 * size: + return True + return False + + @staticmethod + def get_aligned_regions(read_left, read_right, iso_left, iso_right): + if iso_left <= read_left: + return (iso_left, read_left), (iso_right, read_right) + return (read_right, iso_right), (read_left, iso_left) + + @staticmethod + def get_read_sequence(alignment, ref_region): + ref_start, ref_end = ref_region + seq = '' + last_ref_pos = 0 + for read_pos, ref_pos in alignment.get_aligned_pairs(): + if ref_start <= last_ref_pos <= ref_end: + if read_pos is not None: + seq += alignment.seq[read_pos] + last_ref_pos = ref_pos or last_ref_pos + return seq + + +class ExonMisalignmentChecker: + def __init__(self, reference, win_size=25, match=2, mismatch=-0.2, gap_open=-15, gap_extend=-0.03): + if reference is not None: + if not os.path.exists(reference + '.fai'): + logger.info('Building fasta index with samtools') + pysam.faidx(reference) + reference = pysam.FastaFile(reference) + self.reference = reference + self.win_size = win_size + self.scores = (match, mismatch, gap_open, gap_extend) + + def is_exon_misalignment(self, isoform_junctions, isoform_cregion, alignment=None): + if alignment is None or self.reference is None: + return + + # target reference sequence + ref_start, ref_end = self.get_search_region(isoform_junctions, isoform_cregion) + ref_seq = self.reference.fetch(alignment.reference_name, ref_start, ref_end) + + seq = self.get_read_region(alignment, ref_start, ref_end) + + target_junctions = self.get_target_junctions(isoform_junctions, ref_start, ref_end) + return self.is_misalignment(seq, ref_seq, target_junctions) + + def get_search_region(self, isoform_junctions, isoform_cregion): + ref_search_start = isoform_junctions[max(isoform_cregion[0] - 1, 0)][0] + ref_search_end = isoform_junctions[min(isoform_cregion[1] + 1, len(isoform_junctions) - 1)][1] + return ref_search_start, ref_search_end + + def get_read_region(self, alignment, ref_start, ref_end): + # target read sequence + seq = '' + last_ref_pos = 0 + for read_pos, ref_pos in alignment.get_aligned_pairs(): + if ref_start <= last_ref_pos <= ref_end: + if read_pos is not None: + seq += alignment.seq[read_pos] + last_ref_pos = ref_pos or last_ref_pos + return seq + + def get_target_junctions(self, isoform_junctions, left, right): + return [(a - left, b - left) for a, b in isoform_junctions if left <= a < right] + + def is_misalignment(self, seq, ref_seq, target_junctions): + junctions = None + for a in pairwise2.align.globalms(seq, ref_seq, *self.scores): + junctions = self.get_junctions(a) + if self.similar_junctions(junctions, target_junctions): + return True + # print(junctions) + # print('Target ', target_junctions) + # print('seq ', seq) + # print('ref ', ref_seq) + return False + + def similar_junctions(self, junctions, target_junctions): + if len(junctions) != len(target_junctions): + return False + for i in range(len(junctions)): + aligned_start, aligned_end = junctions[i] + target_start, target_end = target_junctions[i] + if not (aligned_start - self.win_size <= target_start <= aligned_start + self.win_size) \ + or not (aligned_end - self.win_size <= target_end <= aligned_end + self.win_size): + return False + return True + + def get_junctions(self, alignment): + seq, ref_seq = alignment[0], alignment[1] + i = 0 + gap_size = 0 + junctions = [] + intron_start = None + for base, ref_base in zip(seq, ref_seq): + if base == '-': + gap_size += 1 + if gap_size == 10: + intron_start = i - gap_size + 1 + elif gap_size > 10: + junctions.append((intron_start, i)) + intron_start = None + gap_size = 0 + else: + gap_size = 0 + i += 1 + return junctions From dd58f1bcd4a1327412c1517b13bcf26c58b4ed46 Mon Sep 17 00:00:00 2001 From: Smetanin Aleksandr Date: Mon, 27 Jul 2020 13:47:27 +0300 Subject: [PATCH 3/6] remove MD tag --- src/read_mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/read_mapper.py b/src/read_mapper.py index 54c80bbb..61345a4a 100644 --- a/src/read_mapper.py +++ b/src/read_mapper.py @@ -226,7 +226,7 @@ def align_fasta(aligner, fastq_paths, args, label, out_dir): elif aligner == "minimap2": minimap2_path = get_aligner('minimap2') - additional_options = ['--MD'] + additional_options = [] if args.stranded == 'forward': additional_options.append('-uf') # TODO: add junction bed From 21f5e262a254b61ddec85700b0ef95e592e08633 Mon Sep 17 00:00:00 2001 From: Smetanin Aleksandr Date: Mon, 27 Jul 2020 15:40:48 +0300 Subject: [PATCH 4/6] add exon skip misalignment check --- src/junction_comparator.py | 53 ++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/src/junction_comparator.py b/src/junction_comparator.py index 441501e6..4740991e 100644 --- a/src/junction_comparator.py +++ b/src/junction_comparator.py @@ -25,7 +25,8 @@ class JunctionComparator: def __init__(self, params, intron_profile_constructor): self.params = params self.intron_profile_constructor = intron_profile_constructor - self.misalignment_checker = MisalignmentChecker(params.reference, params.delta, params.max_intron_shift) + self.misalignment_checker = MisalignmentChecker(params.reference, params.delta, params.max_intron_shift, + params.max_missed_exon_len) # self.exon_misalignment_checker = ExonMisalignmentChecker(params.reference) def compare_junctions(self, read_junctions, read_region, isoform_junctions, isoform_region, alignment=None): @@ -201,7 +202,7 @@ def compare_overlapping_contradictional_regions(self, read_junctions, isoform_ju event = MatchEventSubtype.mutually_exclusive_exons_novel elif read_cregion[1] == read_cregion[0] and isoform_cregion[1] > isoform_cregion[0]: - event = self.classify_skipped_exons(isoform_junctions, isoform_cregion, + event = self.classify_skipped_exons(isoform_junctions, isoform_cregion, read_junctions, read_cregion, total_intron_len_diff, read_introns_known, alignment=alignment) elif read_cregion[1] > read_cregion[0] and isoform_cregion[1] == isoform_cregion[0]: @@ -217,17 +218,14 @@ def compare_overlapping_contradictional_regions(self, read_junctions, isoform_ju event = MatchEventSubtype.alternative_structure_novel return make_event(event, isoform_cregion[0], read_cregion) - def classify_skipped_exons(self, isoform_junctions, isoform_cregion, + def classify_skipped_exons(self, isoform_junctions, isoform_cregion, read_junctions, read_cregion, total_intron_len_diff, read_introns_known, alignment=None): total_exon_len = sum([isoform_junctions[i + 1][0] - isoform_junctions[i][1] + 1 for i in range(isoform_cregion[0], isoform_cregion[1])]) - if total_intron_len_diff < 2 * self.params.delta and total_exon_len <= self.params.max_missed_exon_len: - return MatchEventSubtype.exon_misallignment - elif read_introns_known: - return MatchEventSubtype.exon_skipping_known_intron - else: - return MatchEventSubtype.exon_skipping_novel_intron + return self.misalignment_checker.classify_skipped_exon( + isoform_junctions, isoform_cregion, read_junctions, read_cregion, + total_intron_len_diff, read_introns_known, total_exon_len, alignment) def classify_single_intron_alternation(self, read_junctions, isoform_junctions, read_cpos, isoform_cpos, total_intron_len_diff, read_introns_known, alignment=None): @@ -235,7 +233,7 @@ def classify_single_intron_alternation(self, read_junctions, isoform_junctions, if read_introns_known: return MatchEventSubtype.intron_migration else: - return self.misalignment_checker.intron_shift( + return self.misalignment_checker.classify_intron_misalignment( read_junctions, isoform_junctions, read_cpos, isoform_cpos, alignment) else: # TODO correct when strand is negative @@ -300,7 +298,7 @@ def are_known_introns(self, junctions, region): class MisalignmentChecker: - def __init__(self, reference, delta, max_intron_shift): + def __init__(self, reference, delta, max_intron_shift, max_missed_exon_len): if reference is not None: if not os.path.exists(reference + '.fai'): logger.info('Building fasta index with samtools') @@ -309,9 +307,33 @@ def __init__(self, reference, delta, max_intron_shift): self.reference = reference self.delta = delta self.max_intron_shift = max_intron_shift + self.max_missed_exon_len = max_missed_exon_len self.scores = (2, -1, -1, -.2) - def intron_shift(self, read_junctions, isoform_junctions, read_cpos, isoform_cpos, alignment=None): + def classify_skipped_exon(self, isoform_junctions, isoform_cregion, read_junctions, read_cregion, + total_intron_len_diff, read_introns_known, total_exon_len, alignment): + assert len(set(read_cregion)) == 1 + + read_cpos = read_cregion[0] + read_left, read_right = read_junctions[read_cpos] + l, r = isoform_cregion + iso_left, iso_right = isoform_junctions[l][0], isoform_junctions[r][1] + exon_left, exon_right = isoform_junctions[l][1], isoform_junctions[r][0] + if total_intron_len_diff < 2 * self.delta and total_exon_len <= self.max_missed_exon_len: + if iso_left - self.delta <= read_left and iso_right + self.delta >= read_right: + seq = self.get_read_sequence(alignment, (iso_left, read_left)) \ + + self.get_read_sequence(alignment, (read_right, iso_right)) + ref_seq = self.reference.fetch(alignment.reference_name, exon_left, exon_right) + if self.sequences_match(seq, ref_seq): + return MatchEventSubtype.exon_misallignment + else: + return MatchEventSubtype.exon_skipping_novel_intron + + if read_introns_known: + return MatchEventSubtype.exon_skipping_known_intron + return MatchEventSubtype.exon_skipping_novel_intron + + def classify_intron_misalignment(self, read_junctions, isoform_junctions, read_cpos, isoform_cpos, alignment=None): read_left, read_right = read_junctions[read_cpos] iso_left, iso_right = isoform_junctions[isoform_cpos] @@ -322,10 +344,9 @@ def intron_shift(self, read_junctions, isoform_junctions, read_cpos, isoform_cpo if alignment is None or self.reference is None: if abs(read_left - iso_left) <= self.max_intron_shift: return MatchEventSubtype.intron_shift - else: - return MatchEventSubtype.intron_alternation_novel + return MatchEventSubtype.intron_alternation_novel - read_region, ref_region = self.get_aligned_regions(read_left, read_right, iso_left, iso_right) + read_region, ref_region = self.get_aligned_regions_intron(read_left, read_right, iso_left, iso_right) seq = self.get_read_sequence(alignment, read_region) ref_seq = self.reference.fetch(alignment.reference_name, *ref_region) @@ -342,7 +363,7 @@ def sequences_match(self, seq, ref_seq): return False @staticmethod - def get_aligned_regions(read_left, read_right, iso_left, iso_right): + def get_aligned_regions_intron(read_left, read_right, iso_left, iso_right): if iso_left <= read_left: return (iso_left, read_left), (iso_right, read_right) return (read_right, iso_right), (read_left, iso_left) From 28a278a38c8be77325e95b53f873b3a56e4df8fb Mon Sep 17 00:00:00 2001 From: Smetanin Aleksandr Date: Mon, 27 Jul 2020 15:42:06 +0300 Subject: [PATCH 5/6] remove old exon checker --- src/junction_comparator.py | 87 -------------------------------------- 1 file changed, 87 deletions(-) diff --git a/src/junction_comparator.py b/src/junction_comparator.py index 4740991e..27c8bebb 100644 --- a/src/junction_comparator.py +++ b/src/junction_comparator.py @@ -380,90 +380,3 @@ def get_read_sequence(alignment, ref_region): last_ref_pos = ref_pos or last_ref_pos return seq - -class ExonMisalignmentChecker: - def __init__(self, reference, win_size=25, match=2, mismatch=-0.2, gap_open=-15, gap_extend=-0.03): - if reference is not None: - if not os.path.exists(reference + '.fai'): - logger.info('Building fasta index with samtools') - pysam.faidx(reference) - reference = pysam.FastaFile(reference) - self.reference = reference - self.win_size = win_size - self.scores = (match, mismatch, gap_open, gap_extend) - - def is_exon_misalignment(self, isoform_junctions, isoform_cregion, alignment=None): - if alignment is None or self.reference is None: - return - - # target reference sequence - ref_start, ref_end = self.get_search_region(isoform_junctions, isoform_cregion) - ref_seq = self.reference.fetch(alignment.reference_name, ref_start, ref_end) - - seq = self.get_read_region(alignment, ref_start, ref_end) - - target_junctions = self.get_target_junctions(isoform_junctions, ref_start, ref_end) - return self.is_misalignment(seq, ref_seq, target_junctions) - - def get_search_region(self, isoform_junctions, isoform_cregion): - ref_search_start = isoform_junctions[max(isoform_cregion[0] - 1, 0)][0] - ref_search_end = isoform_junctions[min(isoform_cregion[1] + 1, len(isoform_junctions) - 1)][1] - return ref_search_start, ref_search_end - - def get_read_region(self, alignment, ref_start, ref_end): - # target read sequence - seq = '' - last_ref_pos = 0 - for read_pos, ref_pos in alignment.get_aligned_pairs(): - if ref_start <= last_ref_pos <= ref_end: - if read_pos is not None: - seq += alignment.seq[read_pos] - last_ref_pos = ref_pos or last_ref_pos - return seq - - def get_target_junctions(self, isoform_junctions, left, right): - return [(a - left, b - left) for a, b in isoform_junctions if left <= a < right] - - def is_misalignment(self, seq, ref_seq, target_junctions): - junctions = None - for a in pairwise2.align.globalms(seq, ref_seq, *self.scores): - junctions = self.get_junctions(a) - if self.similar_junctions(junctions, target_junctions): - return True - # print(junctions) - # print('Target ', target_junctions) - # print('seq ', seq) - # print('ref ', ref_seq) - return False - - def similar_junctions(self, junctions, target_junctions): - if len(junctions) != len(target_junctions): - return False - for i in range(len(junctions)): - aligned_start, aligned_end = junctions[i] - target_start, target_end = target_junctions[i] - if not (aligned_start - self.win_size <= target_start <= aligned_start + self.win_size) \ - or not (aligned_end - self.win_size <= target_end <= aligned_end + self.win_size): - return False - return True - - def get_junctions(self, alignment): - seq, ref_seq = alignment[0], alignment[1] - i = 0 - gap_size = 0 - junctions = [] - intron_start = None - for base, ref_base in zip(seq, ref_seq): - if base == '-': - gap_size += 1 - if gap_size == 10: - intron_start = i - gap_size + 1 - elif gap_size > 10: - junctions.append((intron_start, i)) - intron_start = None - gap_size = 0 - else: - gap_size = 0 - i += 1 - return junctions - From 9138c8c5c97942d354124bee24f3a5bb1f89abc4 Mon Sep 17 00:00:00 2001 From: Smetanin Aleksandr Date: Mon, 27 Jul 2020 15:50:42 +0300 Subject: [PATCH 6/6] add check for ref and alignment --- src/junction_comparator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/junction_comparator.py b/src/junction_comparator.py index 27c8bebb..867c4cf6 100644 --- a/src/junction_comparator.py +++ b/src/junction_comparator.py @@ -320,6 +320,9 @@ def classify_skipped_exon(self, isoform_junctions, isoform_cregion, read_junctio iso_left, iso_right = isoform_junctions[l][0], isoform_junctions[r][1] exon_left, exon_right = isoform_junctions[l][1], isoform_junctions[r][0] if total_intron_len_diff < 2 * self.delta and total_exon_len <= self.max_missed_exon_len: + if alignment is None or self.reference is None: + return MatchEventSubtype.exon_misallignment + if iso_left - self.delta <= read_left and iso_right + self.delta >= read_right: seq = self.get_read_sequence(alignment, (iso_left, read_left)) \ + self.get_read_sequence(alignment, (read_right, iso_right))