#!/usr/bin/env python
# -*- coding: utf-8
"""Get sequences for all HMM hits in a given bin.

   This program takes a profile database, a collection ID, a bin name, and an
   HMM source, and returns sequences of HMM hits. This program is useful when you
   want to get actual sequences for each single-copy gene hit in a particular genome
   bin.
"""

import sys
import argparse

import anvio
import anvio.terminal as terminal
import anvio.ccollections as ccollections

from anvio.errors import ConfigError, FilesNPathsError
from anvio.hmmops import SequencesForHMMHits


__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2015, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "a.murat.eren@gmail.com"


run = terminal.Run()
progress = terminal.Progress()


def main(args):
    """Run the program with the parsed command line arguments in `args`.

    When `args.list_available_sources` is set, one line is reported for each
    entry in the search info table of the annotation database, and the
    process exits with status 0 without doing anything else.

    Otherwise the splits of the requested bin(s) are looked up, the HMM hit
    sequences for those splits are collected (restricted to the sources named
    in `args.hmm_sources`, if any were given), and the result is stored in
    `args.output_file`.
    """
    if args.list_available_sources:
        search_info = SequencesForHMMHits(args.annotation_db).search_info_table
        for source_name in search_info:
            entry = search_info[source_name]
            search_type = entry['search_type']
            # gene names are kept as a single comma-separated string
            num_genes = len(entry['genes'].split(','))
            run.info_single('%s [type: %s] [num genes: %d]' % (source_name, search_type, num_genes))
        sys.exit(0)

    bin_splits = ccollections.GetSplitNamesInBins(args).get_dict()
    num_splits = sum(len(split_names) for split_names in bin_splits.values())
    run.info('Init', '%d splits in %d bin(s)' % (num_splits, len(bin_splits)))

    # an empty set is passed along when the user did not name any source
    if args.hmm_sources:
        requested_sources = set(name.strip() for name in args.hmm_sources.split(','))
    else:
        requested_sources = set()

    hmm_hits = SequencesForHMMHits(args.annotation_db, sources = requested_sources)

    sequences = hmm_hits.get_hmm_sequences_dict_for_splits(bin_splits)
    run.info('Result', '%d genes for %d source(s)' % (len(sequences), len(hmm_hits.sources)))

    hmm_hits.store_hmm_sequences_into_FASTA(sequences, args.output_file)
    run.info('Output', args.output_file)


if __name__ == '__main__':
    # Command line interface. The parsed namespace is handed to main() as is.
    parser = argparse.ArgumentParser(description="Get sequences for all HMM hits in a given bin")
    parser.add_argument('-p', '--profile-db', metavar = "PROFILE_DB", default = None,
                        help = 'Profile database.')
    parser.add_argument('-a', '--annotation-db', required = True, metavar = 'ANNOTATION_DB',
                        help = 'Annotation database.')
    parser.add_argument('-c', '--collection-id', metavar = 'COLLECTION-ID', default = None,
                        help = 'Collection id to look for bins.')
    parser.add_argument('-b', '--bin-id', required = False, metavar = 'BIN_ID', default = None,
                        help = 'One bin to analyze (you either use this, or use a file to list\
                                each bin id you prefer to analyze).')
    parser.add_argument('-B', '--bin-ids-file', required = False, metavar = 'BINS FILE', default = None,
                        help = 'Text file for bins (each line should be a bin id).')
    parser.add_argument('-s', '--hmm-sources', metavar = 'HMM SOURCE', default = None,
                        help = 'Get sequences for a specific list of HMM sources. You can list one or more\
                                sources by separating them from each other with a comma character (i.e., \
                                "--hmm-sources source_1,source_2,source_3"). If you would like to see a list\
                                of available sources in the annotation database, run this program with\
                                "--list-available-sources" flag.')
    parser.add_argument('-o', '--output-file', metavar = 'FILE NAME', default = 'sequences_for_hmm_hits.fa',
                        help = 'Output file path. Default is %(default)s.')
    # Sources are read from the annotation database (see main()), so the help
    # text names that database rather than the profile database.
    parser.add_argument('-l', '--list-available-sources', default = False, action = 'store_true',
                        help = 'List available HMM sources in the annotation database and quit.')

    args = parser.parse_args()

    try:
        main(args)
    except (ConfigError, FilesNPathsError) as e:
        # Both error types are reported the same way: print the message and
        # exit with a non-zero status. `except ... as e` and `print(e)` with a
        # single argument work on Python 2.6+ as well as Python 3, unlike the
        # older `except X, e` / `print e` forms.
        print(e)
        sys.exit(-1)
