Source code for k1lib.cli.gb

# AUTOGENERATED FILE! PLEASE DON'T EDIT HERE. EDIT THE SOURCE NOTEBOOKS INSTEAD
"""All tools related to GenBank file format. Expected to use behind the
"gb" module name, like this::

    from k1lib.imports import *
    cat("abc.gb") | gb.feats()
"""
from k1lib import cli
from typing import Any, Union, List
__all__ = ["feats", "origin"]

[docs]
class feats(cli.BaseCli):                                                        # feats
    """Splits the FEATURES section of a GenBank file into individual features.
Only the part of the input between the "FEATURES" line and the "ORIGIN"
line is looked at. Each feature comes out as its own group of raw
(unstripped) lines: the line that names the feature first, followed by
its indented continuation and qualifier lines.
Example::

    cat("a.gb") | gb.feats()

Output example::

    [['     source          1..248956422',
      '                     /organism="Homo sapiens"',
      '                     /mol_type="genomic DNA"',
      '                     /db_xref="taxon:9606"',
      '                     /chromosome="1"'],
     ['     gene            11874..14409',
      '                     /gene="DDX11L1"',
      '                     /note="DEAD/H-box helicase 11 like 1 (pseudogene); Derived',
      '                     by automated computational analysis using gene prediction',
      '                     method: BestRefSeq."',
      '                     /pseudo',
      '                     /db_xref="GeneID:100287102"',
      '                     /db_xref="HGNC:HGNC:37102"']]
"""                                                                              # feats

[docs]
    def __ror__(self, it):                                                       # feats
        it = it | cli.grep("FEATURES", 0, 1e9).till("ORIGIN") | cli.rows()[1:-1] # feats
        cache = []                                                               # feats
        for line in it:                                                          # feats
            if line[4:9] != "     ": # new section detected                      # feats
                if len(cache) > 0: yield cache                                   # feats
                cache = []                                                       # feats
            cache.append(line)                                                   # feats
        if len(cache) > 0: yield iter(cache)                                     # feats


[docs]
    @staticmethod                                                                # feats
    def filt(*terms:str) -> cli.BaseCli:                                         # feats
        """Filters for specific terms in all the features texts. If there
are multiple terms, then filters for first term, then second, then third,
so the term's order might matter to you. Example::

    [['     source          1..248956422',
      '                     /organism="Homo sapiens"',
      '                     /mol_type="genomic DNA"',
      '                     /db_xref="taxon:9606"',
      '                     /chromosome="1"'],
     ['     gene            11874..14409',
      '                     /gene="DDX11L1"',
      '                     /note="DEAD/H-box helicase 11 like 1 (pseudogene); Derived',
      '                     by automated computational analysis using gene prediction',
      '                     method: BestRefSeq."',
      '                     /pseudo',
      '                     /db_xref="GeneID:100287102"',
      '                     /db_xref="HGNC:HGNC:37102"']] | gb.feats.filt("mol_type")

Output::

    [['     source          1..248956422',
      '                     /organism="Homo sapiens"',
      '                     /mol_type="genomic DNA"',
      '                     /db_xref="taxon:9606"',
      '                     /chromosome="1"']]
"""                                                                              # feats
        if len(terms) == 0: return cli.iden()                                    # feats
        if len(terms) > 1: return cli.deref() | cli.init.serial(*(feats.filt(term) for term in terms)) # feats
        return cli.toList().all() | cli.filt(lambda F: F | cli.grep(terms[0]) | cli.shape(0) > 0) # feats


[docs]
    @staticmethod                                                                # feats
    def root() -> cli.BaseCli:                                                   # feats
        """Gets root (top most unnamed tag) of a feature.
Example::

     ['     misc_RNA        complement(join(14362..14829,14970..15038,15796..15947,',
      '                     16607..16765,16858..17055,17233..17368,17606..17742,',
      '                     17915..18061,18268..18366,24738..24891,29321..29370))',
      '                     /gene="WASH7P"',
      '                     /gene_synonym="FAM39F; WASH5P"',
      '                     /product="WASP family homolog 7, pseudogene"',
      '                     /note="Derived by automated computational analysis using',
      '                     gene prediction method: BestRefSeq."',
      '                     /pseudo',
      '                     /transcript_id="NR_024540.1"',
      '                     /db_xref="GeneID:653635"',
      '                     /db_xref="HGNC:HGNC:38034"'] | feats.root()

Output::

    ['misc_RNA',
     ['complement(join(14362..14829,14970..15038,15796..15947,',
      '16607..16765,16858..17055,17233..17368,17606..17742,',
      '17915..18061,18268..18366,24738..24891,29321..29370))']]
"""                                                                              # feats
        return cli.apply(lambda x: x.strip()) | cli.breakIf(lambda x: x.startswith("/")) | ~cli.aS(lambda a, *b: [a.split(" ")[0], [" ".join(a.split(" ")[1:]).strip(), *b]]) # feats


[docs]
    @staticmethod                                                                # feats
    def tags(*tags:List[str]) -> cli.BaseCli:                                    # feats
        """Grabs a list of tags.
Example::

    s = ['     misc_RNA        complement(join(14362..14829,14970..15038,15796..15947,',
         '                     16607..16765,16858..17055,17233..17368,17606..17742,',
         '                     17915..18061,18268..18366,24738..24891,29321..29370))',
         '                     /gene="WASH7P"',
         '                     /gene_synonym="FAM39F; WASH5P"',
         '                     /product="WASP family homolog 7, pseudogene"',
         '                     /note="Derived by automated computational analysis using',
         '                     gene prediction method: BestRefSeq."',
         '                     /pseudo',
         '                     /transcript_id="NR_024540.1"',
         '                     /db_xref="GeneID:653635"',
         '                     /db_xref="HGNC:HGNC:38034"']
    s | feats.tags()

Output::

    [['gene', 'WASH7P'],
     ['gene_synonym', 'FAM39F; WASH5P'],
     ['product', 'WASP family homolog 7, pseudogene'],
     ['note',
      'Derived by automated computational analysis using gene prediction method: BestRefSeq.'],
     ['pseudo', ''],
     ['transcript_id', 'NR_024540.1'],
     ['db_xref', 'GeneID:653635'],
     ['db_xref', 'HGNC:HGNC:38034']]

With filters::

    # returns [['gene', 'WASH7P'], ['db_xref', 'HGNC:HGNC:38034'], ['organism', '']]
    s | feats.tags("gene", "db_xref", "organism")
"""                                                                              # feats
        f = cli.op().strip().all() | cli.grep("^/", sep=True).till() | cli.join(" ").all()\
            | (cli.op().split("=") | ~cli.aS(lambda a, *b: [a[1:], b[0][1:-1] if len(b) > 0 else ""])).all() # feats
        if len(tags) == 0: return f | cli.deref()                                # feats
        def g(it):                                                               # feats
            d = it | f | cli.toDict()                                            # feats
            return [[tag, d[tag] if tag in d else ""] for tag in tags]           # feats
        return cli.aS(g)                                                         # feats



[docs]
class origin(cli.BaseCli):                                                       # origin
    """Returns the sequence held in the ORIGIN section of a GenBank file,
with all of its lines joined into one continuous string.
Example::

    # returns single fasta string
    cat("a.gb") | gb.origin()
"""                                                                              # origin

[docs]
    def __ror__(self, it):                                                       # origin
        return it | cli.grep("ORIGIN", 0, 1e9) | ~cli.head(1) | cli.op().strip().all()\
        | cli.op().split(" ").all() | cli.cut()[1:] | cli.join("").all()\
        | cli.op().replace("/", "").all() | cli.join("")                         # origin