# Source code for macsylib.serialization

#########################################################################
# MacSyLib - Python library to detect macromolecular systems            #
#            in prokaryotes protein dataset using systems modelling     #
#            and similarity search.                                     #
#                                                                       #
# Authors: Sophie Abby, Bertrand Neron                                  #
# Copyright (c) 2014-2025  Institut Pasteur (Paris) and CNRS.           #
# See the COPYRIGHT file for details                                    #
#                                                                       #
# This file is part of MacSyLib package.                                #
#                                                                       #
# MacSyLib is free software: you can redistribute it and/or modify      #
# it under the terms of the GNU General Public License as published by  #
# the Free Software Foundation, either version 3 of the License, or     #
# (at your option) any later version.                                   #
#                                                                       #
# MacSyLib is distributed in the hope that it will be useful,           #
# but WITHOUT ANY WARRANTY; without even the implied warranty of        #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          #
# GNU General Public License for more details .                         #
#                                                                       #
# You should have received a copy of the GNU General Public License     #
# along with MacSyLib (COPYING).                                        #
# If not, see <https://www.gnu.org/licenses/>.                          #
#########################################################################

"""
This module focuses on the ways to serialize the different macsylib outputs
"""

import abc
import typing
from string import Template

from .gene import GeneStatus
from .system import System, RejectedCandidate, LikelySystem, UnlikelySystem, HitSystemTracker
from .solution import Solution
from .hit import Loner, MultiSystem


class SystemSerializer(metaclass=abc.ABCMeta):
    """
    Abstract base of the serializers: it only declares the ``serialize`` entry point
    """

    @abc.abstractmethod
    def serialize(self, system: System, hit_system_tracker: HitSystemTracker):
        """
        Serialize *system*. Concrete subclasses must override this method.

        :param system: the system to serialize.
        :param hit_system_tracker: the hit tracker handed over along with the system.
        """
class TxtSystemSerializer(SystemSerializer):
    """
    Serialize a System as plain text meant to be read by a human
    """

    def serialize(self, system: System, hit_system_tracker: HitSystemTracker) -> str:
        """
        :param system: the system to serialize.
        :param hit_system_tracker: indexed with ``hit.hit``, it gives the systems which use this hit.
        :return: a string representation of system readable by human
        """
        # one "[(id, gene name, position), ...]" group per cluster
        clusters_repr = []
        for cluster in system.clusters:
            hits_repr = ", ".join([str((v_h.id, v_h.gene.name, v_h.position)) for v_h in cluster.hits])
            clusters_repr.append("[" + hits_repr + "]")
        clst = ", ".join(clusters_repr)

        # NOTE(review): the line breaks between the "key = value" fields were reconstructed
        # from a whitespace-collapsed listing -- confirm against the upstream layout.
        chunks = [
            f"system id = {system.id}\n"
            f"model = {system.model.fqn}\n"
            f"replicon = {system.replicon_name}\n"
            f"clusters = {clst}\n"
            f"occ = {system.occurrence()}\n"
            f"wholeness = {system.wholeness:.3f}\n"
            f"loci nb = {system.loci_nb}\n"
            f"score = {system.score:.3f}\n"
        ]

        sections = (("mandatory", system.mandatory_occ),
                    ("accessory", system.accessory_occ),
                    ("neutral", system.neutral_occ))
        for title, genes in sections:
            chunks.append(f"\n{title} genes:\n")
            for g_name, gene_hits in genes.items():
                labels = []
                for hit in gene_hits:
                    # ids of the systems of another model which also use this hit
                    other_ids = sorted(other.id for other in hit_system_tracker[hit.hit]
                                       if other.model.fqn != system.model.fqn)
                    if other_ids:
                        labels.append(f"{hit.gene.name} [{', '.join(other_ids)}]")
                    else:
                        labels.append(f"{hit.gene.name}")
                chunks.append(f"\t- {g_name}: {len(gene_hits)} ({', '.join(labels)})\n")
        return "".join(chunks)
class TsvSystemSerializer(SystemSerializer):
    """
    Serialize a System in tsv format, one row per hit
    """

    # column names, in the same order as the placeholders of `template`
    header = "\t".join((
        "replicon", "hit_id", "gene_name", "hit_pos", "model_fqn",
        "sys_id", "sys_loci", "locus_num", "sys_wholeness", "sys_score", "sys_occ",
        "hit_gene_ref", "hit_status", "hit_seq_len", "hit_i_eval", "hit_score",
        "hit_profile_cov", "hit_seq_cov", "hit_begin_match", "hit_end_match",
        "counterpart", "used_in",
    ))

    # one row; looked up through `self` in serialize
    template = Template("\t".join((
        "$sys_replicon_name", "$mh_id", "$mh_gene_name", "$mh_position", "$sys_model_fqn",
        "$sys_id", "$sys_loci", "$locus_num", "$sys_wholeness", "$sys_score", "$sys_occurrence",
        "$mh_gene_role", "$mh_status", "$mh_seq_length", "$mh_i_eval", "$mh_score",
        "$mh_profile_coverage", "$mh_sequence_coverage", "$mh_begin_match", "$mh_end_match",
        "$mh_counterpart", "$used_in_systems",
    )) + "\n")

    def serialize(self, system: System, hit_system_tracker: HitSystemTracker) -> str:
        """
        :param system: The system to serialize.
        :type system: :class:`macsylib.system.System` object
        :param hit_system_tracker: indexed with ``hit.hit``, it gives the systems which use this hit.
        :type hit_system_tracker: :class:`macsylib.system.HitSystemTracker` object
        :return: the system in tab separated values format.
                 The clusters are walked in order, paired with ``system.loci_num``,
                 and inside a cluster the hits are sorted by position.
                 Each line represents a hit and has the following structure:

            .. code-block:: python

                replicon\\thit_id\\tgene_name\\thit_pos\\tmodel_fqn\\tsys_id\\tsys_loci\\tlocus_num\\tsys_wholeness\\tsys_score
                \\tsys_occ\\thit_gene_ref.alternate_of\\thit_status\\thit_seq_len\\thit_i_eval\\thit_score\\thit_profile_cov
                \\thit_seq_cov\\thit_begin_match\\thit_end_match\\tcounterpart\\tused_in_systems
        """
        rows = []
        for locus_num, cluster in zip(system.loci_num, system.clusters):
            for mh in sorted(cluster.hits, key=lambda one_hit: one_hit.position):
                # ids of the systems of another model which also use this hit
                other_ids = sorted(other.id for other in hit_system_tracker[mh.hit]
                                   if other.model.fqn != system.model.fqn)
                fields = {
                    'sys_replicon_name': system.replicon_name,
                    'mh_id': mh.id,
                    'mh_gene_name': mh.gene.name,
                    'mh_position': mh.position,
                    'sys_model_fqn': system.model.fqn,
                    'sys_id': system.id,
                    'sys_loci': system.loci_nb,
                    'locus_num': locus_num,
                    'sys_wholeness': f"{system.wholeness:.3f}",
                    'sys_score': f"{system.score:.3f}",
                    'sys_occurrence': system.occurrence(),
                    'mh_gene_role': mh.gene_ref.alternate_of().name,
                    'mh_status': mh.status,
                    'mh_seq_length': mh.seq_length,
                    'mh_i_eval': mh.i_eval,
                    'mh_score': f"{mh.score:.3f}",
                    'mh_profile_coverage': f"{mh.profile_coverage:.3f}",
                    'mh_sequence_coverage': f"{mh.sequence_coverage:.3f}",
                    'mh_begin_match': mh.begin_match,
                    'mh_end_match': mh.end_match,
                    'mh_counterpart': ','.join([h.id for h in mh.counterpart]),
                    'used_in_systems': ','.join(other_ids),
                }
                rows.append(self.template.substitute(fields))
        return ''.join(rows)
class TsvSolutionSerializer:
    """
    Handle Solution (list of Systems) serialization in tsv format
    """

    header = 'sol_id\t' + TsvSystemSerializer.header
    # "$$" is the Template escape for a literal "$": the text of this template
    # starts with "$sol_id" once the escape has been resolved.
    template = Template(f"$$sol_id\t{TsvSystemSerializer.template.template}")

    def serialize(self, solution: Solution, sol_id: int, hit_system_tracker: HitSystemTracker) -> str:
        """
        :param solution: the solution to serialize
        :param sol_id: the solution identifier
        :param hit_system_tracker: forwarded as is to
                                   :meth:`macsylib.serialization.TsvSystemSerializer.serialize`
        :return: a serialisation of this solution (a list of systems) in tabulated separated value format
                 each line represent a hit and have the same structure as system serialization
                 :meth:`macsylib.serialization.TsvSystemSerializer.serialize`
                 but with an extra column sol_id which is a technical id to identify the different solutions.
                 The rows of each system are followed by an empty line.
        """
        # The solution id is baked into the row template so that each row is rendered
        # in ONE substitution pass. Rendering the rows first and parsing the result as
        # a second Template would fail (ValueError / KeyError) as soon as a serialized
        # value, a replicon or gene name for instance, contains a "$".
        escaped_sol_id = str(sol_id).replace("$", "$$")
        row_template = Template(self.template.template.replace("$$sol_id", escaped_sol_id, 1))

        sys_ser = TsvSystemSerializer()
        sys_ser.template = row_template  # instance attribute: the class level template is left untouched
        return "".join(f"{sys_ser.serialize(system, hit_system_tracker)}\n" for system in solution)
class TxtLikelySystemSerializer(SystemSerializer):
    """
    Serialize a LikelySystem as plain text meant to be read by a human
    """

    def serialize(self, system: LikelySystem, hit_system_tracker: HitSystemTracker):
        """
        :param system: The likely system to serialize. Used only for unordered db-type
        :param hit_system_tracker: indexed with ``hit.hit``, it gives the systems which use this hit.
        :return: a string representation of system readable by human
        """
        hits_repr = ", ".join([str((h.id, h.gene.name, h.position)) for h in system.hits])
        warning = '\n'
        if system.forbidden_hits:
            warning = "WARNING the quorum is reached but there is also some forbidden genes.\n"

        # NOTE(review): the line breaks between the fields were reconstructed
        # from a whitespace-collapsed listing -- confirm against the upstream layout.
        chunks = [
            f"This replicon contains genetic materials needed for system {system.model.fqn}\n"
            f"{warning}\n"
            f"system id = {system.id}\n"
            f"model = {system.model.fqn}\n"
            f"replicon = {system.replicon_name}\n"
            f"hits = [{hits_repr}]\n"
            f"wholeness = {system.wholeness:.3f}\n"
        ]

        sections = (("mandatory", system.mandatory_occ),
                    ("accessory", system.accessory_occ),
                    ("neutral", system.neutral_occ),
                    ("forbidden", system.forbidden_occ))
        for title, genes in sections:
            chunks.append(f"\n{title} genes:\n")
            for g_name, gene_hits in genes.items():
                labels = []
                for hit in gene_hits:
                    # ids of the systems of another model which also use this hit
                    other_ids = sorted(other.id for other in hit_system_tracker[hit.hit]
                                       if other.model.fqn != system.model.fqn)
                    if other_ids:
                        labels.append(f"{hit.gene.name} [{', '.join(other_ids)}]")
                    else:
                        labels.append(f"{hit.gene.name}")
                chunks.append(f"\t- {g_name}: {len(gene_hits)} ({', '.join(labels)})\n")

        chunks.append("\nUse ordered replicon to have better prediction.\n")
        return "".join(chunks)
class TsvLikelySystemSerializer(SystemSerializer):
    """
    Handle potential System from unordered replicon serialization in tsv format
    """

    header = "replicon\thit_id\tgene_name\thit_pos\tmodel_fqn\tsys_id\tsys_wholeness" \
             "\thit_gene_ref\thit_status\thit_seq_len\thit_i_eval\thit_score\thit_profile_cov\thit_seq_cov\t" \
             "hit_begin_match\thit_end_match\tused_in"

    template = Template("$sys_replicon_name\t$mh_id\t$mh_gene_name\t$mh_position\t$sys_model_fqn\t"
                        "$sys_id\t$sys_wholeness\t"
                        "$mh_gene_role\t$mh_status\t$mh_seq_length\t$mh_i_eval\t"
                        "$mh_score\t$mh_profile_coverage\t$mh_sequence_coverage\t$mh_begin_match"
                        "\t$mh_end_match\t$used_in_systems\n")

    def serialize(self, system: LikelySystem, hit_system_tracker: HitSystemTracker) -> str:
        """
        :param system: The likely system to serialize. Used only for unordered db-type
        :param hit_system_tracker: indexed with ``hit.hit``, it gives the systems which use this hit.
        :return: a serialisation of this system in tabulated separated value format.
                 The output starts with a ``#`` comment naming the model, followed by a warning
                 when the system has forbidden hits, then the header.
                 The hits are grouped by gene status, in the order of ``GeneStatus`` members,
                 and sorted by gene name inside a group.
                 Each line represents a hit and has the following structure:

            .. code-block:: python

                replicon\\thit_id\\tgene_name\\thit_pos\\tmodel_fqn\\tsys_id\\tsys_wholeness
                \\thit_gene_ref.alternate_of\\thit_status\\thit_seq_len\\thit_i_eval\\thit_score\\thit_profile_cov
                \\thit_seq_cov\\thit_begin_match\\thit_end_match\\tused_in_systems

        :rtype: str
        """
        if system.forbidden_hits:
            warning = "# WARNING the quorum is reached but there is also some forbidden genes.\n"
        else:
            warning = '\n'
        # NOTE(review): the line break between the comment line and the warning was reconstructed
        # from a whitespace-collapsed listing -- confirm against the upstream layout.
        chunks = [
            f"# This replicon contains genetic materials needed for system {system.model.fqn}\n{warning}",
            self.header,
            '\n',
        ]

        for status in (member.lower() for member in GeneStatus.__members__):
            # Only the attribute probe is guarded: a status with no matching
            # "<status>_hits" attribute on the system is skipped, whereas an AttributeError
            # raised while sorting the hits is a real error and must not be swallowed.
            try:
                status_hits = getattr(system, f"{status}_hits")
            except AttributeError:
                continue
            for mh in sorted(status_hits, key=lambda one_hit: one_hit.gene.name):
                # ids of the systems of another model which also use this hit
                used_in_systems = sorted(other.id for other in hit_system_tracker[mh.hit]
                                         if other.model.fqn != system.model.fqn)
                chunks.append(
                    self.template.substitute(
                        sys_replicon_name=system.replicon_name,
                        mh_id=mh.id,
                        mh_gene_name=mh.gene.name,
                        mh_position=mh.position,
                        sys_model_fqn=system.model.fqn,
                        sys_id=system.id,
                        sys_wholeness=f"{system.wholeness:.3f}",
                        mh_gene_role=mh.gene_ref.alternate_of().name,
                        mh_status=mh.status,
                        mh_seq_length=mh.seq_length,
                        mh_i_eval=mh.i_eval,
                        mh_score=f"{mh.score:.3f}",
                        mh_profile_coverage=f"{mh.profile_coverage:.3f}",
                        mh_sequence_coverage=f"{mh.sequence_coverage:.3f}",
                        mh_begin_match=mh.begin_match,
                        mh_end_match=mh.end_match,
                        used_in_systems=','.join(used_in_systems)
                    )
                )
        return ''.join(chunks)
class TxtUnikelySystemSerializer(SystemSerializer):
    """
    Handle UnlikelySystem serialization in text
    """

    def serialize(self, system: UnlikelySystem, hit_system_tracker: HitSystemTracker | None = None) -> str:
        """
        :param system: The unlikely system to serialize. (used only if db-type is "unordered_replicon")
        :param hit_system_tracker: not used. It is accepted, and optional, only to keep the signature
                                   compatible with :meth:`SystemSerializer.serialize`,
                                   so this serializer can be called like the other system serializers.
        :return: a string representation of system readable by human
        """
        hits_repr = ", ".join([str((h.id, h.gene.name, h.position)) for h in system.hits])
        reasons = '\n'.join(system.reasons)

        # NOTE(review): the line breaks, and the empty line after the reasons, were reconstructed
        # from a whitespace-collapsed listing -- confirm against the upstream layout.
        chunks = [
            f"This replicon probably not contains a system {system.model.fqn}:\n"
            f"{reasons}\n"
            "\n"
            f"system id = {system.id}\n"
            f"model = {system.model.fqn}\n"
            f"replicon = {system.replicon_name}\n"
            f"hits = [{hits_repr}]\n"
            f"wholeness = {system.wholeness:.3f}\n"
        ]

        sections = (("mandatory", system.mandatory_occ),
                    ("accessory", system.accessory_occ),
                    ("neutral", system.neutral_occ),
                    ("forbidden", system.forbidden_occ))
        for title, genes in sections:
            chunks.append(f"\n{title} genes:\n")
            for g_name, gene_hits in genes.items():
                labels = ", ".join([f"{h.gene.name}" for h in gene_hits])
                chunks.append(f"\t- {g_name}: {len(gene_hits)} ({labels})\n")

        chunks.append("\nUse ordered replicon to have better prediction.\n")
        return "".join(chunks)
class TsvSpecialHitSerializer:
    """
    Serialize special hits: :class:`macsylib.hit.Loner` and :class:`macsylib.hit.MultiSystem` in tsv format
    """

    def serialize(self, best_hits: typing.Iterable[Loner] | typing.Iterable[MultiSystem]) -> str:
        """
        :param best_hits: the special hits to serialized
        :type best_hits: iterable of :class:`macsylib.hit.Loner` or :class:`macsylib.hit.MultiSystem` objects
        :return: a header line followed by one row per distinct hit (the best hits and
                 their counterparts), sorted by position; an empty string if there is no hit.
        """
        # Materialize once: `best_hits` is walked twice below, and a one-shot iterable
        # (a generator for instance) is always truthy and is exhausted after the first pass.
        best_hits = list(best_hits or ())
        if not best_hits:
            return ""

        # the set removes the hits which are both a best hit and the counterpart of another one
        special_hits = set(best_hits)
        for best_hit in best_hits:
            special_hits.update(best_hit.counterpart)

        header = "replicon\tmodel_fqn\tfunction\tgene_name\t" \
                 "hit_id\thit_pos\thit_status\thit_seq_len\t" \
                 "hit_i_eval\thit_score\thit_profile_cov\t" \
                 "hit_seq_cov\thit_begin_match\thit_end_match\n"
        rows = [header]
        for one_hit in sorted(special_hits, key=lambda h: h.position):
            rows.append(
                f"{one_hit.replicon_name}\t{one_hit.gene_ref.model.fqn}\t{one_hit.gene_ref.alternate_of().name}\t"
                f"{one_hit.gene_ref.name}\t{one_hit.id}\t{one_hit.position:d}\t{one_hit.status}\t"
                f"{one_hit.seq_length:d}\t{one_hit.i_eval:.3e}\t{one_hit.score:.3f}\t"
                f"{one_hit.profile_coverage:.3f}\t{one_hit.sequence_coverage:.3f}\t"
                f"{one_hit.begin_match:d}\t{one_hit.end_match:d}\n"
            )
        return "".join(rows)
class TsvRejectedCandidatesSerializer:
    """
    Serialize the rejected candidates in tsv format
    """

    def serialize(self, candidates: list[RejectedCandidate]) -> str:
        """
        :param candidates: list of rejected candidates to serialize
        :return: a header line then one row per hit of each cluster of each candidate;
                 an empty string if there is no candidate.
        """
        if not candidates:
            return ""

        chunks = ["candidate_id\treplicon\tmodel_fqn\tcluster_id\thit_id\thit_pos\tgene_name\tfunction\treasons\n"]
        for candidate in candidates:
            # all the reasons of a candidate share one cell, separated by '/'
            reasons = '/'.join(candidate.reasons)
            for cluster in candidate.clusters:
                chunks.extend(
                    f"{candidate.id}\t{candidate.replicon_name}\t{candidate.model.fqn}\t"
                    f"{cluster.id}\t{hit.id}\t{hit.position}\t{hit.gene_ref.name}"
                    f"\t{hit.gene_ref.alternate_of().name}\t"
                    f"{reasons}\n"
                    for hit in cluster.hits
                )
            # NOTE(review): the empty line is emitted after each candidate; its nesting level was
            # reconstructed from a whitespace-collapsed listing -- confirm against the upstream layout.
            chunks.append('\n')
        return "".join(chunks)