citations.bib


% PhiSpy prophage-finding algorithm (Nucleic Acids Res., 2012).
@article{Akhter2012-an,
  author    = {Akhter, Sajia and Aziz, Ramy K and Edwards, Robert A},
  title     = {{PhiSpy}: a novel algorithm for finding prophages in bacterial
    genomes that combines similarity- and composition-based
    strategies},
  journal   = {Nucleic Acids Res.},
  publisher = {Oxford University Press},
  volume    = 40,
  number    = 16,
  pages     = {e126},
  month     = sep,
  year      = 2012,
  language  = {en},
  abstract  = {Prophages are phages in lysogeny that are integrated into, and
    replicated as part of, the host bacterial genome. These mobile
    elements can have tremendous impact on their bacterial hosts'
    genomes and phenotypes, which may lead to strain emergence and
    diversification, increased virulence or antibiotic resistance.
    However, finding prophages in microbial genomes remains a
    problem with no definitive solution. The majority of existing
    tools rely on detecting genomic regions enriched in
    protein-coding genes with known phage homologs, which hinders
    the de novo discovery of phage regions. In this study, a
    weighted phage detection algorithm, PhiSpy was developed based
    on seven distinctive characteristics of prophages, i.e. protein
    length, transcription strand directionality, customized AT and
    GC skew, the abundance of unique phage words, phage insertion
    points and the similarity of phage proteins. The first five
    characteristics are capable of identifying prophages without any
    sequence similarity with known phage genes. PhiSpy locates
    prophages by ranking genomic regions enriched in distinctive
    phage traits, which leads to the successful prediction of 94\%
    of prophages in 50 complete bacterial genomes with a 6\%
    false-negative rate and a 0.66\% false-positive rate.},
}

% VIDHOP viral host prediction preprint (bioRxiv, 2019).
% The unpublished entry type requires a note field in classic BibTeX styles,
% which ignore journal/pages for this type; the note carries the preprint
% server and identifier so the reference can be located.
@UNPUBLISHED{Mock2019-kv,
  title    = "Viral host prediction with Deep Learning",
  author   = "Mock, Florian and Viehweger, Adrian and Barth, Emanuel and Marz,
              Manja",
  abstract = "Zoonosis, the natural transmission of infections from animal to
              human, is a far-reaching global problem. The recent outbreaks of
              Zika virus and Ebola virus are examples of viral zoonosis, which
              occur more frequently due to globalization. In case of a virus
              outbreak, it is helpful to know which host organism was the
              original carrier of the virus. Once the reservoir or intermediate
              host is known, it can be isolated to prevent further spreading of
              the viral infection. Recent approaches aim to predict a viral
              host based on the viral genome, often in combination with the
              potential host genome and using arbitrary selected features. This
              methods have a clear limitation in either the amount of different
              hosts they can predict or the accuracy of the prediction. Here,
              we present a fast and accurate deep learning approach for viral
              host prediction, which is based on the viral genome sequence
              only. To assure a high prediction accuracy we developed an
              effective selection approach for the training data, to avoid
              biases due to a highly unbalanced number of known sequences per
              virus-host combinations. We tested our deep neural network on
              three different virus species (influenza A virus, rabies
              lyssavirus, rotavirus A) and reached for each virus species a AUC
              between 0.94 and 0.98, outperforming previous approaches and
              allowing highly accurate predictions while only using fractions
              of the viral genome sequences. We show that deep neural networks
              are suitable to predict the host of a virus, even with a limited
              amount of sequences and highly unbalanced available data. The
              deep neural networks trained for this approach build the core of
              the virus host predicting tool VIDHOP (VIrus Deep learning HOst
              Prediction).",
  journal  = "bioRxiv",
  pages    = "575571",
  note     = "bioRxiv preprint 575571",
  month    =  mar,
  year     =  2019,
  language = "en"
}

% PHISDetector phage-host interaction web tool preprint (bioRxiv, 2019).
% The unpublished entry type requires a note field in classic BibTeX styles,
% which ignore journal/pages for this type; the note carries the preprint
% server and identifier so the reference can be located.
@UNPUBLISHED{Zhang2019-sk,
  title    = "{PHISDetector}: a web tool to detect diverse in silico phage-host
              interaction signals",
  author   = "Zhang, Fan and Zhou, Fengxia and Gan, Rui and Ren, Chunyan and
              Jia, Yuqiang and Yu, Ling and Huang, Zhiwei",
  abstract = "Phage-host interactions are appealing systems to study
              co-evolution. Their roles in human health and diseases as well as
              novel therapeutics development also have been increasingly
              emphasized. Meanwhile, such interactions leave signals in
              bacterial and phage genomic sequences, defined as phage-host
              interaction signals (PHIS), allowing us to predict novel
              phage-host interactions. Due to the intrinsic complexity and
              recent emerging of metagenomics sequencing data, there is an
              urgent requirement to develop computational tools to analyze
              massive data and extract meaningful information. Here, we seize
              comprehensive in silico PHIS and utilize sophisticated
              bioinformatics to develop PHISDetector, a web tool to detect and
              systematically study diverse in silico PHIS, including analyses
              for co-occurrence/co-abundance patterns, oligonucleotide
              profile/sequence composition, CRISPR-targeting, prophages, phage
              genome similarity, protein-protein interactions, and special gene
              check. PHISDetector accepts various genomic and metagenomic data
              as input and provides well-designed visualizations and detailed
              data tables to download. Prediction tasks are processed remotely
              by the server using custom python scripts and a series of public
              tools. PHISDetector can be accessed at
              http://www.microbiome-bigdata.com/PHISDetector/index/.",
  journal  = "bioRxiv",
  pages    = "661074",
  note     = "bioRxiv preprint 661074",
  month    =  jun,
  year     =  2019,
  language = "en"
}

% PhaMers k-mer based phage identification preprint (bioRxiv, 2017).
% The unpublished entry type requires a note field in classic BibTeX styles,
% which ignore journal/pages for this type; the note carries the preprint
% server and identifier so the reference can be located.
% The abstract was cleaned of export artifacts: a leading heading word and
% commas that had been turned into periods mid-sentence.
@UNPUBLISHED{Deaton2017-yh,
  title    = "{PhaMers} identifies novel bacteriophage sequences from
              thermophilic hot springs",
  author   = "Deaton, Jonathan and Yu, Feiqiao Brian and Quake, Stephen R",
  abstract = "Metagenomic sequencing approaches have become popular
              for the purpose of dissecting environmental microbial diversity,
              leading to the characterization of novel microbial lineages. In
              addition of bacterial and fungal genomes, metagenomic analysis
              can also reveal genomes of viruses that infect microbial cells.
              Because of their small genome size and limited knowledge of phage
              diversity, discovering novel phage sequences from metagenomic
              data is often challenging. Here we describe PhaMers
              (Phage k-Mers), a phage identification tool that uses supervised
              learning to classify metagenomic contigs as phage or non-phage on
              the basis of tetranucleotide frequencies, a technique that does
              not depend on existing gene annotations. PhaMers compares the
              tetranucleotide frequencies of metagenomic contigs to phage and
              bacteria references from online databases, resulting in
              assignments of lower level phage taxonomy based on sequence
              similarity. Using PhaMers, we identified 103 novel phage
              sequences from hot spring samples of Yellowstone National Park
              based on data generated from a microfluidic-based minimetagenomic
              approach. We analyzed assembled contigs over 5 kbp in length
              using PhaMers and compared the results with those generated by
              VirSorter, a publicly available phage identification and
              annotation package. We analyzed the performance of phage genome
              prediction and taxonomic classification using PhaMers, and
              presented putative hosts and taxa for some of the novel phage
              sequences. Finally, mini-metagenomic occurrence profiles of phage
              and prokaryotic genomes were used to verify putative hosts.",
  journal  = "bioRxiv",
  pages    = "169672",
  note     = "bioRxiv preprint 169672",
  month    =  jul,
  year     =  2017,
  language = "en"
}


% CRISPRdigger CRISPR annotation program (Sci. Rep., 2016).
@article{Ge2016-bj,
  author   = {Ge, Ruiquan and Mai, Guoqin and Wang, Pu and Zhou, Manli and Luo,
    Youxi and Cai, Yunpeng and Zhou, Fengfeng},
  title    = {{CRISPRdigger}: detecting {CRISPRs} with better direct repeat
    annotations},
  journal  = {Sci. Rep.},
  volume   = 6,
  pages    = {32942},
  month    = sep,
  year     = 2016,
  language = {en},
  abstract = {Clustered regularly interspaced short palindromic repeats
    (CRISPRs) are important genetic elements in many bacterial and
    archaeal genomes, and play a key role in prokaryote immune
    systems' fight against invasive foreign elements. The CRISPR
    system has also been engineered to facilitate target gene editing
    in eukaryotic genomes. Using the common features of mis-annotated
    CRISPRs in prokaryotic genomes, this study proposed an accurate
    de novo CRISPR annotation program CRISPRdigger, which can take a
    partially assembled genome as its input. A comprehensive
    comparison with the three existing programs demonstrated that
    CRISPRdigger can recover more Direct Repeats (DRs) for CRISPRs
    and achieve a higher accuracy for a query genome. The program was
    implemented by Perl and all the parameters had default values, so
    that a user could annotate CRISPRs in a query genome by supplying
    only a genome sequence in the FASTA format. All the supplementary
    data are available at http://www.healthinformaticslab.org/supp/.},
}

% CRISPRDirection crRNA strand prediction (Bioinformatics, 2014).
@article{Biswas2014-rd,
  author   = {Biswas, Ambarish and Fineran, Peter C and Brown, Chris M},
  title    = {Accurate computational prediction of the transcribed strand of
    {CRISPR} non-coding {RNAs}},
  journal  = {Bioinformatics},
  volume   = 30,
  number   = 13,
  pages    = {1805--1813},
  month    = jul,
  year     = 2014,
  language = {en},
  abstract = {MOTIVATION: CRISPR RNAs (crRNAs) are a type of small non-coding
    RNA that form a key part of an acquired immune system in
    prokaryotes. Specific prediction methods find crRNA-encoding loci
    in nearly half of sequenced bacterial, and three quarters of
    archaeal, species. These Clustered Regularly Interspaced Short
    Palindromic Repeats (CRISPR) arrays consist of repeat elements
    alternating with specific spacers. Generally one strand is
    transcribed, producing long pre-crRNAs, which are processed to
    short crRNAs that base pair with invading nucleic acids to
    facilitate their destruction. No current software for the
    discovery of CRISPR loci predicts the direction of crRNA
    transcription. RESULTS: We have developed an algorithm that
    accurately predicts the strand of the resulting crRNAs. The
    method uses as input CRISPR repeat predictions. CRISPRDirection
    uses parameters that are calculated from the CRISPR repeat
    predictions and flanking sequences, which are combined by
    weighted voting. The prediction may use prior coding sequence
    annotation but this is not required. CRISPRDirection correctly
    predicted the orientation of 94\% of a reference set of arrays.
    AVAILABILITY AND IMPLEMENTATION: The Perl source code is freely
    available from http://bioanalysis.otago.ac.nz/CRISPRDirection.},
}

% CRISPRstrand repeat-orientation predictor (Bioinformatics, 2014).
% The page range is written out in full (i489--i496); the abbreviated end
% page "96" drops the "i" prefix and renders as a malformed range.
@ARTICLE{Alkhnbashi2014-ys,
  title    = "{CRISPRstrand}: predicting repeat orientations to determine the
              {crRNA-encoding} strand at {CRISPR} loci",
  author   = "Alkhnbashi, Omer S and Costa, Fabrizio and Shah, Shiraz A and
              Garrett, Roger A and Saunders, Sita J and Backofen, Rolf",
  abstract = "MOTIVATION: The discovery of CRISPR-Cas systems almost 20 years
              ago rapidly changed our perception of the bacterial and archaeal
              immune systems. CRISPR loci consist of several repetitive DNA
              sequences called repeats, inter-spaced by stretches of variable
              length sequences called spacers. This CRISPR array is transcribed
              and processed into multiple mature RNA species (crRNAs). A single
              crRNA is integrated into an interference complex, together with
              CRISPR-associated (Cas) proteins, to bind and degrade invading
              nucleic acids. Although existing bioinformatics tools can
              recognize CRISPR loci by their characteristic repeat-spacer
              architecture, they generally output CRISPR arrays of ambiguous
              orientation and thus do not determine the strand from which
              crRNAs are processed. Knowledge of the correct orientation is
              crucial for many tasks, including the classification of CRISPR
              conservation, the detection of leader regions, the identification
              of target sites (protospacers) on invading genetic elements and
              the characterization of protospacer-adjacent motifs. RESULTS: We
              present a fast and accurate tool to determine the crRNA-encoding
              strand at CRISPR loci by predicting the correct orientation of
              repeats based on an advanced machine learning approach. Both the
              repeat sequence and mutation information were encoded and
              processed by an efficient graph kernel to learn higher-order
              correlations. The model was trained and tested on curated data
              comprising >4500 CRISPRs and yielded a remarkable performance of
              0.95 AUC ROC (area under the curve of the receiver operator
              characteristic). In addition, we show that accurate orientation
              information greatly improved detection of conserved repeat
              sequence families and structure motifs. We integrated
              CRISPRstrand predictions into our CRISPRmap web server of CRISPR
              conservation and updated the latter to version 2.0. AVAILABILITY:
              CRISPRmap and CRISPRstrand are available at
              http://rna.informatik.uni-freiburg.de/CRISPRmap. SUPPLEMENTARY
              INFORMATION: Supplementary data are available at Bioinformatics
              online.",
  journal  = "Bioinformatics",
  volume   =  30,
  number   =  17,
  pages    = "i489--i496",
  month    =  sep,
  year     =  2014,
  language = "en"
}

% CRISPRmap repeat conservation classification (Nucleic Acids Res., 2013).
@article{Lange2013-qs,
  author   = {Lange, Sita J and Alkhnbashi, Omer S and Rose, Dominic and Will,
    Sebastian and Backofen, Rolf},
  title    = {{CRISPRmap}: an automated classification of repeat conservation
    in prokaryotic adaptive immune systems},
  journal  = {Nucleic Acids Res.},
  volume   = 41,
  number   = 17,
  pages    = {8034--8044},
  month    = sep,
  year     = 2013,
  language = {en},
  abstract = {Central to Clustered Regularly Interspaced Short Palindromic
    Repeat (CRISPR)-Cas systems are repeated RNA sequences that serve
    as Cas-protein-binding templates. Classification is based on the
    architectural composition of associated Cas proteins, considering
    repeat evolution is essential to complete the picture. We
    compiled the largest data set of CRISPRs to date, performed
    comprehensive, independent clustering analyses and identified a
    novel set of 40 conserved sequence families and 33 potential
    structure motifs for Cas-endoribonucleases with some distinct
    conservation patterns. Evolutionary relationships are presented
    as a hierarchical map of sequence and structure similarities for
    both a quick and detailed insight into the diversity of
    CRISPR-Cas systems. In a comparison with Cas-subtypes, I-C, I-E,
    I-F and type II were strongly coupled and the remaining type I
    and type III subtypes were loosely coupled to repeat and Cas1
    evolution, respectively. Subtypes with a strong link to CRISPR
    evolution were almost exclusive to bacteria; nevertheless, we
    identified rare examples of potential horizontal transfer of I-C
    and I-E systems into archaeal organisms. Our easy-to-use web
    server provides an automated assignment of newly sequenced
    CRISPRs to our classification system and enables more informed
    choices on future hypotheses in CRISPR-Cas research:
    http://rna.informatik.uni-freiburg.de/CRISPRmap.},
}

% PILER-CR CRISPR repeat identification (BMC Bioinformatics, 2007).
@article{Edgar2007-wo,
  author   = {Edgar, Robert C},
  title    = {{PILER-CR}: fast and accurate identification of {CRISPR} repeats},
  journal  = {BMC Bioinformatics},
  volume   = 8,
  pages    = {18},
  month    = jan,
  year     = 2007,
  language = {en},
  abstract = {BACKGROUND: Sequencing of prokaryotic genomes has recently
    revealed the presence of CRISPR elements: short, highly conserved
    repeats separated by unique sequences of similar length. The
    distinctive sequence signature of CRISPR repeats can be found
    using general-purpose repeat- or pattern-finding software tools.
    However, the output of such tools is not always ideal for
    studying these repeats, and significant effort is sometimes
    needed to build additional tools and perform manual analysis of
    the output. RESULTS: We present PILER-CR, a program specifically
    designed for the identification and analysis of CRISPR repeats.
    The program executes rapidly, completing a 5 Mb genome in around
    5 seconds on a current desktop computer. We validate the
    algorithm by manual curation and by comparison with published
    surveys of these repeats, finding that PILER-CR has both high
    sensitivity and high specificity. We also present a catalogue of
    putative CRISPR repeats identified in a comprehensive analysis of
    346 prokaryotic genomes. CONCLUSION: PILER-CR is a useful tool
    for rapid identification and classification of CRISPR repeats.
    The software is donated to the public domain. Source code and a
    Linux binary are freely available at
    http://www.drive5.com/pilercr.},
}

% CRISPR Visualizer locus extraction and visualization pipeline (RNA Biol., 2019).
@article{Nethery2019-xu,
  author   = {Nethery, Matthew A and Barrangou, Rodolphe},
  title    = {{CRISPR} Visualizer: rapid identification and visualization of
    {CRISPR} loci via an automated high-throughput processing
    pipeline},
  journal  = {RNA Biol.},
  volume   = 16,
  number   = 4,
  pages    = {577--584},
  month    = apr,
  year     = 2019,
  keywords = {CRISPR spacer; CRISPR-Cas; crRNA; repeat detection; software},
  language = {en},
  abstract = {A CRISPR locus, defined by an array of repeat and spacer
    elements, constitutes a genetic record of the ceaseless battle
    between bacteria and viruses, showcasing the genomic integration
    of spacers acquired from invasive DNA. In particular, iterative
    spacer acquisitions represent unique evolutionary histories and
    are often useful for high-resolution bacterial genotyping,
    including comparative analysis of closely related organisms,
    clonal lineages, and clinical isolates. Current spacer
    visualization methods are typically tedious and can require
    manual data manipulation and curation, including spacer
    extraction at each CRISPR locus from genomes of interest. Here,
    we constructed a high-throughput extraction pipeline coupled with
    a local web-based visualization tool which enables CRISPR spacer
    and repeat extraction, rapid visualization, graphical comparison,
    and progressive multiple sequence alignment. We present the
    bioinformatic pipeline and investigate the loci of reference
    CRISPR-Cas systems and model organisms in 4 well-characterized
    subtypes. We illustrate how this analysis uncovers the
    evolutionary tracks and homology shared between various organisms
    through visual comparison of CRISPR spacers and repeats, driven
    through progressive alignments. Due to the ability to process
    unannotated genome files with minimal preparation and curation,
    this pipeline can be implemented promptly. Overall, this
    efficient high-throughput solution supports accelerated analysis
    of genomic data sets and enables and expedites genotyping efforts
    based on CRISPR loci.},
}

% CRISPI interactive CRISPR database (Bioinformatics, 2009).
% The language code is normalised to the two-letter form "en" used by the
% other entries in this file (it was the three-letter "eng").
@ARTICLE{Rousseau2009-zv,
  title    = "{CRISPI}: a {CRISPR} interactive database",
  author   = "Rousseau, Christine and Gonnet, Mathieu and Le Romancer, Marc and
              Nicolas, Jacques",
  abstract = "SUMMARY: The CRISPR genomic structures (Clustered Regularly
              Interspaced Short Palindromic Repeats) form a family of repeats
              that is largely present in archaea and frequent in bacteria. On
              the basis of a formal model of CRISPR using very few parameters,
              a systematic study of all their occurrences in all available
              genomes of Archaea and Bacteria has been carried out. This has
              resulted in a relational database, CRISPI, which also includes a
              complete repertory of associated CRISPR-associated genes (CAS). A
              user-friendly web interface with many graphical tools and
              functions allows users to extract results, find CRISPR in
              personal sequences or calculate sequence similarity with spacers.
              AVAILABILITY: CRISPI free access at http://crispi.genouest.org
              CONTACT: croussea@irisa.fr; jnicolas@irisa.fr",
  journal  = "Bioinformatics",
  volume   =  25,
  number   =  24,
  pages    = "3317--3318",
  month    =  dec,
  year     =  2009,
  language = "en"
}

% Book chapter on virus-host protein-protein interaction methods (Springer, 2018).
@incollection{Cook2018-go,
  author    = {Cook, Helen V and Jensen, Lars Juhl},
  title     = {An Integrative Approach to {Virus--Host} {Protein--Protein}
    Interactions},
  booktitle = {Computational Cell Biology: Methods and Protocols},
  editor    = {von Stechow, Louise and Santos Delgado, Alberto},
  publisher = {Springer New York},
  address   = {New York, NY},
  pages     = {175--196},
  year      = 2018,
  abstract  = {Since cell regulation and protein expression can be dramatically
    altered upon infection by viruses, studying the mechanisms by
    which viruses infect cells and the regulatory networks they
    disrupt is essential to understanding viral pathogenicity. This
    line of study can also lead to discoveries about the workings of
    host cells themselves. Computational methods are rapidly being
    developed to investigate viral-host interactions, and here we
    highlight recent methods and the insights that they have
    revealed so far, with a particular focus on methods that
    integrate different types of data. We also review the challenges
    of working with viruses compared with traditional cellular
    biology, and the limitations of current experimental and
    informatics methods.},
}


% Machine-learning prediction of phage-bacterium interactions (BMC Bioinformatics, 2018).
@article{Leite2018-yf,
  author   = {Leite, Diogo Manuel Carvalho and Brochet, Xavier and Resch,
    Gr{\'e}gory and Que, Yok-Ai and Neves, Aitana and Pe{\~n}a-Reyes,
    Carlos},
  title    = {Computational prediction of inter-species relationships through
    omics data analysis and machine learning},
  journal  = {BMC Bioinformatics},
  volume   = 19,
  number   = {Suppl 14},
  pages    = {420},
  month    = nov,
  year     = 2018,
  keywords = {Health; Machine learning; Phage-therapy; Supervised learning},
  language = {en},
  abstract = {BACKGROUND: Antibiotic resistance and its rapid dissemination
    around the world threaten the efficacy of currently-used medical
    treatments and call for novel, innovative approaches to manage
    multi-drug resistant infections. Phage therapy, i.e., the use of
    viruses (phages) to specifically infect and kill bacteria during
    their life cycle, is one of the most promising alternatives to
    antibiotics. It is based on the correct matching between a target
    pathogenic bacteria and the therapeutic phage. Nevertheless,
    correctly matching them is a major challenge. Currently, there is
    no systematic method to efficiently predict whether
    phage-bacterium interactions exist and these pairs must be
    empirically tested in laboratory. Herein, we present our approach
    for developing a computational model able to predict whether a
    given phage-bacterium pair can interact based on their genome.
    RESULTS: Based on public data from GenBank and phagesDB.org, we
    collected more than a thousand positive phage-bacterium
    interactions with their complete genomes. In addition, we
    generated putative negative (i.e., non-interacting) pairs. We
    extracted, from the collected genomes, a set of informative
    features based on the distribution of predictive protein-protein
    interactions and on their primary structure (e.g. amino-acid
    frequency, molecular weight and chemical composition of each
    protein). With these features, we generated multiple candidate
    datasets to train our algorithms. On this base, we built
    predictive models exhibiting predictive performance of around
    90\% in terms of F1-score, sensitivity, specificity, and
    accuracy, obtained on the test set with 10-fold cross-validation.
    CONCLUSION: These promising results reinforce the hypothesis that
    machine learning techniques may produce highly-predictive models
    accelerating the search of interacting phage-bacteria pairs.},
}


% Conference paper on ensemble-learning prediction of phage-bacteria interactions (Springer, 2017).
@inproceedings{Carvalho_Leite2017-eo,
  author    = {Carvalho Leite, Diogo Manuel and Brochet, Xavier and Resch,
    Gr{\'e}gory and Que, Yok-Ai and Neves, Aitana and
    Pe{\~n}a-Reyes, Carlos},
  title     = {Computational Prediction of {Host-Pathogen} Interactions Through
    Omics Data Analysis and Machine Learning},
  booktitle = {Bioinformatics and Biomedical Engineering},
  publisher = {Springer International Publishing},
  pages     = {360--371},
  year      = 2017,
  abstract  = {The emergence and rapid dissemination of antibiotic resistance,
    worldwide, threatens medical progress and calls for innovative
    approaches for the management of multidrug resistant infections.
    Phage-therapy, i.e., the use of viruses (phages) that
    specifically infect and kill bacteria during their life cycle,
    is a re-emerging and promising alternative to solve this
    problem. The success of phage therapy mainly relies on the exact
    matching between the target pathogenic bacteria and the
    therapeutic phage. Currently, there are only a few tools or
    methodologies that efficiently predict phage-bacteria
    interactions suitable for the phage therapy, and the pairs
    phage-bacterium are thus empirically tested in laboratory. In
    this paper we present an original methodology, based on an
    ensemble-learning approach, to predict whether or not a given
    pair of phage-bacteria would interact. Using publicly available
    information from Genbank and phagesdb.org, we assembled a
    dataset containing more than two thousand phage-bacterium
    interactions with their corresponding genomes. A set of
    informative features, extracted from these genomes, form the
    base of the quantitative datasets used to train our predictive
    models. These features include the distribution of predicted
    protein-protein interaction scores, as well as the amino acid
    frequency, the chemical composition, and the molecular weight of
    such proteins. Using an independent test dataset to evaluate the
    performance of our methodology, our approach gets encouraging
    performance with more than 90\% of accuracy, specificity, and
    sensitivity.},
}

% Review of structure-based host-pathogen protein interaction prediction (Curr. Opin. Struct. Biol., 2017).
@article{Mariano2017-vs,
  author   = {Mariano, Rachelle and Wuchty, Stefan},
  title    = {Structure-based prediction of host-pathogen protein interactions},
  journal  = {Curr. Opin. Struct. Biol.},
  volume   = 44,
  pages    = {119--124},
  month    = jun,
  year     = 2017,
  language = {en},
  abstract = {The discovery, validation, and characterization of protein-based
    interactions from different species are crucial for translational
    research regarding a variety of pathogens, ranging from the
    malaria parasite Plasmodium falciparum to HIV-1. Here, we review
    recent advances in the prediction of host-pathogen protein
    interfaces using structural information. In particular, we
    observe that current methods chiefly perform machine learning on
    sequence and domain information to produce large sets of
    candidate interactions that are further assessed and pruned to
    generate final, highly probable sets. Structure-based studies
    have also emphasized the electrostatic properties and
    evolutionary transformations of pathogenic interfaces, supplying
    crucial insight into antigenic determinants and the ways
    pathogens compete for host protein binding. Advancements in
    spectroscopic and crystallographic methods complement the
    aforementioned techniques, permitting the rigorous study of true
    positives at a molecular level. Together, these approaches
    illustrate how protein structure on a variety of levels functions
    coordinately and dynamically to achieve host takeover.},
}

@article{Hurwitz2018-tf,
  author   = {Hurwitz, Bonnie L and Ponsero, Alise and Thornton, Jr, James and
              U'Ren, Jana M},
  title    = {Phage hunters: Computational strategies for finding phages in
              large-scale 'omics datasets},
  journal  = {Virus Res.},
  year     = 2018,
  month    = jan,
  volume   = 244,
  pages    = {110--115},
  keywords = {Bioinformatics; Computational biology; Metagenomics; Phage;
              Prophage; Virus-host coevolution},
  language = {en},
  abstract = {A plethora of tools exist for identifying phage sequences in
              bacterial genomes, single cell amplified genomes, and
              host-associated and environmental metagenomes. Yet because the
              genetics of phages and their hosts are closely intertwined,
              distinguishing viral from bacterial signal remains an ongoing
              challenge. Further the size, quantity and fragmentary nature of
              modern 'omics datasets ushers in a new set of computational
              challenges. Here, we detail the promises and pitfalls of using
              currently available gene-centric or k-mer based tools for
              identifying prophage sequences in genomes and prophage and viral
              contigs in metagenomes. Each of these methods offers a unique
              piece of the puzzle to elucidating the intriguing signatures of
              phage-host coevolution.}
}

@article{Amgarten2018-ik,
  author   = {Amgarten, Deyvid and Braga, Lucas P P and da Silva, Aline M and
              Setubal, Jo{\~a}o C},
  title    = {{MARVEL}, a Tool for Prediction of Bacteriophage Sequences in
              Metagenomic Bins},
  journal  = {Front. Genet.},
  year     = 2018,
  month    = aug,
  volume   = 9,
  pages    = {304},
  keywords = {machine learning; microbiome; phage; random forest; virus},
  language = {en},
  abstract = {Here we present MARVEL, a tool for prediction of double-stranded
              DNA bacteriophage sequences in metagenomic bins. MARVEL uses a
              random forest machine learning approach. We trained the program
              on a dataset with 1,247 phage and 1,029 bacterial genomes, and
              tested it on a dataset with 335 bacterial and 177 phage genomes.
              We show that three simple genomic features extracted from contig
              sequences were sufficient to achieve a good performance in
              separating bacterial from phage sequences: gene density, strand
              shifts, and fraction of significant hits to a viral protein
              database. We compared the performance of MARVEL to that of
              VirSorter and VirFinder, two popular programs for predicting
              viral sequences. Our results show that all three programs have
              comparable specificity, but MARVEL achieves much better
              performance on the recall (sensitivity) measure. This means that
              MARVEL should be able to identify many more phage sequences in
              metagenomic bins than heretofore has been possible. In a simple
              test with real data, containing mostly bacterial sequences,
              MARVEL classified 58 out of 209 bins as phage genomes; other
              evidence suggests that 57 of these 58 bins are novel phage
              sequences. MARVEL is freely available at
              https://github.com/LaboratorioBioinformatica/MARVEL.}
}

@ARTICLE{Galiez2017-xb,
  title    = "{WIsH}: who is the host? Predicting prokaryotic hosts from
              metagenomic phage contigs",
  author   = "Galiez, Clovis and Siebert, Matthias and Enault, Fran{\c c}ois
              and Vincent, Jonathan and S{\"o}ding, Johannes",
  abstract = "Summary: WIsH predicts prokaryotic hosts of phages from their
              genomic sequences. It achieves 63\% mean accuracy when predicting
              the host genus among 20 genera for 3 kbp-long phage contigs. Over
              the best current tool, WIsH shows much improved accuracy on phage
              sequences of a few kbp length and runs hundreds of times faster,
              making it suited for metagenomics studies. Availability and
              implementation: OpenMP-parallelized GPL-licensed C++ code
              available at https://github.com/soedinglab/wish. Contact:
              clovis.galiez@mpibpc.mpg.de or soeding@mpibpc.mpg.de.
              Supplementary information: Supplementary data are available at
              Bioinformatics online.",
  journal  = "Bioinformatics",
  volume   =  33,
  number   =  19,
  pages    = "3113--3114",
  month    =  oct,
  year     =  2017,
  language = "en"
}

@ARTICLE{Villarroel2016-wr,
  title    = "{HostPhinder}: A Phage Host Prediction Tool",
  author   = "Villarroel, Julia and Kleinheinz, Kortine Annina and Jurtz,
              Vanessa Isabell and Zschach, Henrike and Lund, Ole and Nielsen,
              Morten and Larsen, Mette Voldby",
  abstract = "The current dramatic increase of antibiotic resistant bacteria
              has revitalised the interest in bacteriophages as alternative
              antibacterial treatment. Meanwhile, the development of
              bioinformatics methods for analysing genomic data places
              high-throughput approaches for phage characterization within
              reach. Here, we present HostPhinder, a tool aimed at predicting
              the bacterial host of phages by examining the phage genome
              sequence. Using a reference database of 2196 phages with known
              hosts, HostPhinder predicts the host species of a query phage as
              the host of the most genomically similar reference phages. As a
              measure of genomic similarity the number of co-occurring k-mers
              (DNA sequences of length k) is used. Using an independent
              evaluation set, HostPhinder was able to correctly predict host
              genus and species for 81\% and 74\% of the phages respectively,
              giving predictions for more phages than BLAST and significantly
              outperforming BLAST on phages for which both had predictions.
              HostPhinder predictions on phage draft genomes from the INTESTI
              phage cocktail corresponded well with the advertised targets of
              the cocktail. Our study indicates that for most phages genomic
              similarity correlates well with related bacterial hosts.
              HostPhinder is available as an interactive web service [1] and as
              a stand alone download from the Docker registry [2].",
  journal  = "Viruses",
  volume   =  8,
  number   =  5,
  pages    = "116",
  month    =  may,
  year     =  2016,
  keywords = "genome; k-mers; prediction; ``host specificity''",
  language = "en"
}

@ARTICLE{Hayes2017-sq,
  title    = "Metagenomic Approaches to Assess Bacteriophages in Various
              Environmental Niches",
  author   = "Hayes, Stephen and Mahony, Jennifer and Nauta, Arjen and van
              Sinderen, Douwe",
  abstract = "Bacteriophages are ubiquitous and numerous parasites of bacteria
              and play a critical evolutionary role in virtually every
              ecosystem, yet our understanding of the extent of the diversity
              and role of phages remains inadequate for many ecological niches,
              particularly in cases in which the host is unculturable. During
              the past 15 years, the emergence of the field of viral
              metagenomics has drastically enhanced our ability to analyse the
              so-called viral 'dark matter' of the biosphere. Here, we review
              the evolution of viral metagenomic methodologies, as well as
              providing an overview of some of the most significant
              applications and findings in this field of research.",
  journal  = "Viruses",
  volume   =  9,
  number   =  6,
  pages    = "127",
  month    =  may,
  year     =  2017,
  keywords = "marine; microbiota; phage; virome",
  language = "en"
}

@ARTICLE{Mihara2016-oa,
  title    = "Linking Virus Genomes with Host Taxonomy",
  author   = "Mihara, Tomoko and Nishimura, Yosuke and Shimizu, Yugo and
              Nishiyama, Hiroki and Yoshikawa, Genki and Uehara, Hideya and
              Hingamp, Pascal and Goto, Susumu and Ogata, Hiroyuki",
  abstract = "Environmental genomics can describe all forms of
              organisms---cellular and viral---present in a community. The
              analysis of such eco-systems biology data relies heavily on
              reference databases, e.g., taxonomy or gene function databases.
              Reference databases of symbiosis sensu lato, although essential
              for the analysis of organism interaction networks, are lacking.
              By mining existing databases and literature, we here provide a
              comprehensive and manually curated database of taxonomic links
              between viruses and their cellular hosts.",
  journal  = "Viruses",
  volume   =  8,
  number   =  3,
  pages    = "66",
  month    =  mar,
  year     =  2016,
  keywords = "GenomeNet; KEGG; database; genomes; taxonomy; virus-host
              interactions",
  language = "en"
}

@article{Ren2017-ef,
  author   = {Ren, Jie and Ahlgren, Nathan A and Lu, Yang Young and Fuhrman,
              Jed A and Sun, Fengzhu},
  title    = {{VirFinder}: a novel k-mer based tool for identifying viral
              sequences from assembled metagenomic data},
  journal  = {Microbiome},
  year     = 2017,
  month    = jul,
  volume   = 5,
  number   = 1,
  pages    = {69},
  keywords = {Human gut; Liver cirrhosis; Metagenome; Virus; k-mer},
  language = {en},
  abstract = {BACKGROUND: Identifying viral sequences in mixed metagenomes
              containing both viral and host contigs is a critical first step
              in analyzing the viral component of samples. Current tools for
              distinguishing prokaryotic virus and host contigs primarily use
              gene-based similarity approaches. Such approaches can
              significantly limit results especially for short contigs that
              have few predicted proteins or lack proteins with similarity to
              previously known viruses. METHODS: We have developed VirFinder,
              the first k-mer frequency based, machine learning method for
              virus contig identification that entirely avoids gene-based
              similarity searches. VirFinder instead identifies viral sequences
              based on our empirical observation that viruses and hosts have
              discernibly different k-mer signatures. VirFinder's performance
              in correctly identifying viral sequences was tested by training
              its machine learning model on sequences from host and viral
              genomes sequenced before 1 January 2014 and evaluating on
              sequences obtained after 1 January 2014. RESULTS: VirFinder had
              significantly better rates of identifying true viral contigs
              (true positive rates (TPRs)) than VirSorter, the current
              state-of-the-art gene-based virus classification tool, when
              evaluated with either contigs subsampled from complete genomes or
              assembled from a simulated human gut metagenome. For example, for
              contigs subsampled from complete genomes, VirFinder had 78-,
              2.4-, and 1.8-fold higher TPRs than VirSorter for 1, 3, and 5 kb
              contigs, respectively, at the same false positive rates as
              VirSorter (0, 0.003, and 0.006, respectively), thus VirFinder
              works considerably better for small contigs than VirSorter.
              VirFinder furthermore identified several recently sequenced virus
              genomes (after 1 January 2014) that VirSorter did not and that
              have no nucleotide similarity to previously sequenced viruses,
              demonstrating VirFinder's potential advantage in identifying
              novel viral sequences. Application of VirFinder to a set of human
              gut metagenomes from healthy and liver cirrhosis patients reveals
              higher viral diversity in healthy individuals than cirrhosis
              patients. We also identified contig bins containing
              crAssphage-like contigs with higher abundance in healthy patients
              and a putative Veillonella genus prophage associated with
              cirrhosis patients. CONCLUSIONS: This innovative k-mer based tool
              complements gene-based approaches and will significantly improve
              prokaryotic viral sequence identification, especially for
              metagenomic-based studies of viral ecology.}
}


@ARTICLE{Roux2015-rt,
  title    = "{VirSorter}: mining viral signal from microbial genomic data",
  author   = "Roux, Simon and Enault, Fran{\c c}ois and Hurwitz, Bonnie L and
              Sullivan, Matthew B",
  abstract = "Viruses of microbes impact all ecosystems where microbes drive
              key energy and substrate transformations including the oceans,
              humans and industrial fermenters. However, despite this
              recognized importance, our understanding of viral diversity and
              impacts remains limited by too few model systems and reference
              genomes. One way to fill these gaps in our knowledge of viral
              diversity is through the detection of viral signal in microbial
              genomic data. While multiple approaches have been developed and
              applied for the detection of prophages (viral genomes integrated
              in a microbial genome), new types of microbial genomic data are
              emerging that are more fragmented and larger scale, such as
              Single-cell Amplified Genomes (SAGs) of uncultivated organisms or
              genomic fragments assembled from metagenomic sequencing. Here, we
              present VirSorter, a tool designed to detect viral signal in
              these different types of microbial sequence data in both a
              reference-dependent and reference-independent manner, leveraging
              probabilistic models and extensive virome data to maximize
              detection of novel viruses. Performance testing shows that
              VirSorter's prophage prediction capability compares to that of
              available prophage predictors for complete genomes, but is
              superior in predicting viral sequences outside of a host genome
              (i.e., from extrachromosomal prophages, lytic infections, or
              partially assembled prophages). Furthermore, VirSorter
              outperforms existing tools for fragmented genomic and metagenomic
              datasets, and can identify viral signal in assembled sequence
              (contigs) as short as 3kb, while providing near-perfect
              identification (>95\% Recall and 100\% Precision) on contigs of
              at least 10kb. Because VirSorter scales to large datasets, it can
              also be used in ``reverse'' to more confidently identify viral
              sequence in viral metagenomes by sorting away cellular DNA
              whether derived from gene transfer agents, generalized
              transduction or contamination. Finally, VirSorter is made
              available through the iPlant Cyberinfrastructure that provides a
              web-based user interface interconnected with the required
              computing resources. VirSorter thus complements existing prophage
              prediction softwares to better leverage fragmented, SAG and
              metagenomic datasets in a way that will scale to modern
              sequencing. Given these features, VirSorter should enable the
              discovery of new viruses in microbial datasets, and further our
              understanding of uncultivated viral communities across diverse
              ecosystems.",
  journal  = "PeerJ",
  volume   =  3,
  pages    = "e985",
  year     =  2015,
  language = "en"
}


@UNPUBLISHED{Tampuu2019-iu,
  title    = "{ViraMiner}: Deep Learning on Raw {DNA} Sequences for Identifying
              Viral Genomes in Human Samples",
  author   = "Tampuu, Ardi and Bzhalava, Zurab and Dillner, Joakim and Vicente,
              Raul",
  abstract = "Despite its clinical importance, detection of highly
              divergent or yet unknown viruses is a major challenge. When human
              samples are sequenced, conventional alignments classify many
              assembled contigs as ``unknown'' since many of the sequences are
              not similar to known genomes. In this work, we developed
              ViraMiner, a deep learning-based method to identify viruses in
              various human biospecimens. ViraMiner contains two branches of
              Convolutional Neural Networks designed to detect both patterns
              and pattern-frequencies on raw metagenomics contigs. The training
              dataset included sequences obtained from 19 metagenomic
              experiments which were analyzed and labeled by BLAST. The model
              achieves significantly improved accuracy compared to other
              machine learning methods for viral genome classification. Using
              300 bp contigs ViraMiner achieves 0.923 area under the ROC curve.
              To our knowledge, this is the first machine learning methodology
              that can detect the presence of viral sequences among raw
              metagenomic contigs from diverse human samples. We suggest that
              the proposed model captures different types of information of
              genome composition, and can be used as a recommendation system to
              further investigate sequences labeled as ``unknown'' by
              conventional alignment methods. Exploring these highly-divergent
              viruses, in turn, can enhance our knowledge of infectious causes
              of diseases.",
  journal  = "bioRxiv",
  pages    = "602656",
  note     = "bioRxiv preprint 602656",
  month    =  apr,
  year     =  2019,
  language = "en"
}


@ARTICLE{Zhang2017-ew,
  title    = "Prediction of virus-host infectious association by supervised
              learning methods",
  author   = "Zhang, Mengge and Yang, Lianping and Ren, Jie and Ahlgren, Nathan
              A and Fuhrman, Jed A and Sun, Fengzhu",
  abstract = "BACKGROUND: The study of virus-host infectious association is
              important for understanding the functions and dynamics of
              microbial communities. Both cellular and fractionated viral
              metagenomic data generate a large number of viral contigs with
              missing host information. Although relative simple methods based
              on the similarity between the word frequency vectors of viruses
              and bacterial hosts have been developed to study virus-host
              associations, the problem is significantly understudied. We
              hypothesize that machine learning methods based on word
              frequencies can be efficiently used to study virus-host
              infectious associations. METHODS: We investigate four different
              representations of word frequencies of viral sequences including
              the relative word frequency and three normalized word frequencies
              by subtracting the number of expected from the observed word
              counts. We also study five machine learning methods including
              logistic regression, support vector machine, random forest,
              Gaussian naive Bayes and Bernoulli naive Bayes for separating
              infectious from non-infectious viruses for nine bacterial host
              genera with at least 45 infecting viruses. Area under the
              receiver operating characteristic curve (AUC) is used to compare
              the performance of different machine learning method and feature
              combinations. We then evaluate the performance of the best method
              for the identification of the hosts of contigs in metagenomic
              studies. We also develop a maximum likelihood method to estimate
              the fraction of true infectious viruses for a given host in viral
              tagging experiments. RESULTS: Based on nine bacterial host genera
              with at least 45 infectious viruses, we show that random forest
              together with the relative word frequency vector performs the
              best in identifying viruses infecting particular hosts. For all
              the nine host genera, the AUC is over 0.85 and for five of them,
              the AUC is higher than 0.98 when the word size is 6 indicating
              the high accuracy of using machine learning approaches for
              identifying viruses infecting particular hosts. We also show that
              our method can predict the hosts of viral contigs of length at
              least 1kbps in metagenomic studies with high accuracy. The random
              forest together with word frequency vector outperforms current
              available methods based on Manhattan and $d_2^*$
              dissimilarity measures. Based on word frequencies, we estimate
              that about 95\% of the identified T4-like viruses in viral
              tagging experiment infect Synechococcus, while only about 29\% of
              the identified non-T4-like viruses and 30\% of the contigs in the
              study potentially infect Synechococcus. CONCLUSIONS: The random
              forest machine learning method together with the relative word
              frequencies as features of viruses can be used to predict viruses
              and viral contigs for specific bacterial hosts. The maximum
              likelihood approach can be used to estimate the fraction of true
              infectious associated viruses in viral tagging experiments.",
  journal  = "BMC Bioinformatics",
  volume   =  18,
  number   = "Suppl 3",
  pages    = "60",
  month    =  mar,
  year     =  2017,
  language = "en"
}

@UNPUBLISHED{Wang2019-oi,
  title    = "A network-based integrated framework for predicting virus-host
              interactions",
  author   = "Wang, Weili and Ren, Jie and Tang, Kujin and Dart, Emily and
              Ignacio-Espinoza, Julio Cesar and Fuhrman, Jed A and Braun,
              Jonathan and Sun, Fengzhu and Ahlgren, Nathan A",
  abstract = "Metagenomic sequencing has greatly enhanced the discovery of
              viral genomic sequences; however it remains challenging to
              identify the host(s) of these new viruses. We developed
              VirHostMatcher-Net, a flexible, network-based, Markov random
              field framework for predicting virus-host interactions using
              multiple, integrated features: CRISPR sequences, sequence
              homology, and alignment-free similarity measures ($s_2^*$
              and WIsH). Evaluation of this method on a benchmark set of 1,075
              known viruses-host pairs yielded host prediction accuracy of 62\%
              and 85\% at the genus and phylum levels, representing 12--27\% and
              10--18\% improvement respectively over previous single-feature
              prediction approaches. We applied our host-prediction tool to
              three metagenomic virus datasets: human gut crAss-like phages,
              marine viruses, and viruses recovered from globally-distributed,
              diverse habitats. Host predictions were frequently consistent
              with those of previous studies, but more importantly, this new
              tool made many more confident predictions than previous tools, up
              to 6-fold more (n > 60,000), greatly expanding the diversity of
              known virus-host interactions.",
  journal  = "bioRxiv",
  pages    = "505768",
  note     = "bioRxiv preprint 505768",
  month    =  aug,
  year     =  2019,
  language = "en"
}

@article{Chibani-Chennoufi2004-dl,
  author   = {Chibani-Chennoufi, Sandra and Bruttin, Anne and Dillmann,
              Marie-Lise and Br{\"u}ssow, Harald},
  title    = {Phage-host interaction: an ecological perspective},
  journal  = {J. Bacteriol.},
  year     = 2004,
  month    = jun,
  volume   = 186,
  number   = 12,
  pages    = {3677--3686},
  language = {en}
}


@MISC{Edwards_undated-kq,
  title        = "{PhiSpy}",
  author       = "Edwards, Rob",
  abstract     = "Prediction of prophages from bacterial genomes. Contribute to
                  linsalrob/PhiSpy development by creating an account on GitHub.",
  howpublished = "{GitHub} repository",
  url          = "https://github.com/linsalrob/PhiSpy",
  institution  = "GitHub"
}


@ARTICLE{Chibani2019-dh,
  title    = "Classifying the Unclassified: A Phage Classification Method",
  author   = "Chibani, Cynthia Maria and Farr, Anton and Klama, Sandra and
              Dietrich, Sascha and Liesegang, Heiko",
  abstract = "This work reports the method ClassiPhage to classify phage
              genomes using sequence derived taxonomic features. ClassiPhage
              uses a set of phage specific Hidden Markov Models (HMMs)
              generated from clusters of related proteins. The method was
              validated on all publicly available genomes of phages that are
              known to infect Vibrionaceae. The phages belong to the
              well-described phage families of Myoviridae, Podoviridae,
              Siphoviridae, and Inoviridae. The achieved classification is
              consistent with the assignments of the International Committee on
              Taxonomy of Viruses (ICTV), all tested phages were assigned to
              the corresponding group of the ICTV-database. In addition, 44 out
              of 58 genomes of Vibrio phages not yet classified could be
              assigned to a phage family. The remaining 14 genomes may
              represent phages of new families or subfamilies. Comparative
              genomics indicates that the ability of the approach to identify
              and classify phages is correlated to the conserved genomic
              organization. ClassiPhage classifies phages exclusively based on
              genome sequence data and can be applied on distinct phage genomes
              as well as on prophage regions within host genomes. Possible
              applications include (a) classifying phages from assembled
              metagenomes; and (b) the identification and classification of
              integrated prophages and the splitting of phage families into
              subfamilies.",
  journal  = "Viruses",
  volume   =  11,
  number   =  2,
  pages    = "195",
  month    =  feb,
  year     =  2019,
  keywords = "Hidden Markov Models; Inoviridae; Myoviridae;
              Podoviridae; Siphoviridae; Vibrionaceae; classification; phages;
              protein coding sequences; vibriophages",
  language = "en"
}

@ARTICLE{Canchaya2003-it,
  title    = "Prophage genomics",
  author   = "Canchaya, Carlos and Proux, Caroline and Fournous, Ghislain and
              Bruttin, Anne and Br{\"u}ssow, Harald",
  abstract = "The majority of the bacterial genome sequences deposited in the
              National Center for Biotechnology Information database contain
              prophage sequences. Analysis of the prophages suggested that
              after being integrated into bacterial genomes, they undergo a
              complex decay process consisting of inactivating point mutations,
              genome rearrangements, modular exchanges, invasion by further
              mobile DNA elements, and massive DNA deletion. We review the
              technical difficulties in defining such altered prophage
              sequences in bacterial genomes and discuss theoretical frameworks
              for the phage-bacterium interaction at the genomic level. The
              published genome sequences from three groups of eubacteria (low-
              and high-G+C gram-positive bacteria and gamma-proteobacteria)
              were screened for prophage sequences. The prophages from
              Streptococcus pyogenes served as test case for theoretical
              predictions of the role of prophages in the evolution of
              pathogenic bacteria. The genomes from further human, animal, and
              plant pathogens, as well as commensal and free-living bacteria,
              were included in the analysis to see whether the same principles
              of prophage genomics apply for bacteria living in different
              ecological niches and coming from distinct phylogenetical
              affinities. The effect of selection pressure on the host
              bacterium is apparently an important force shaping the prophage
              genomes in low-G+C gram-positive bacteria and
              gamma-proteobacteria.",
  journal  = "Microbiol. Mol. Biol. Rev.",
  volume   =  67,
  number   =  2,
  pages    = "238--276",
  month    =  jun,
  year     =  2003,
  language = "en"
}

% The abstract below is a truncated excerpt as exported; its trailing
% ellipsis is written as \ldots so the entry stays pure ASCII.
@ARTICLE{Casjens2003-hk,
  title     = "Prophages and bacterial genomics: what have we learned so far?",
  author    = "Casjens, S",
  abstract  = "Epigraph There is something fascinating about science. One gets
               such wholesale returns of conjecture out of such a trifling
               investment of fact. Mark Twain 1883 Life on the Mississippi
               Summary Bacterial genome nucleotide sequences are being
               completed at a rapid and \ldots",
  journal   = "Mol. Microbiol.",
  publisher = "Wiley Online Library",
  volume    =  49,
  number    =  2,
  pages     = "277--300",
  year      =  2003
}

@article{Fouts2006-nt,
  author   = {Fouts, Derrick E},
  title    = {{Phage\_Finder}: automated identification and classification of
              prophage regions in complete bacterial genome sequences},
  journal  = {Nucleic Acids Res.},
  year     = 2006,
  month    = oct,
  volume   = 34,
  number   = 20,
  pages    = {5839--5851},
  language = {en},
  abstract = {Phage\_Finder, a heuristic computer program, was created to
              identify prophage regions in completed bacterial genomes. Using a
              test dataset of 42 bacterial genomes whose prophages have been
              manually identified, Phage\_Finder found 91\% of the regions,
              resulting in 7\% false positive and 9\% false negative prophages.
              A search of 302 complete bacterial genomes predicted 403 putative
              prophage regions, accounting for 2.7\% of the total bacterial
              DNA. Analysis of the 285 putative attachment sites revealed tRNAs
              are targets for integration slightly more frequently (33\%) than
              intergenic (31\%) or intragenic (28\%) regions, while tmRNAs were
              targeted in 8\% of the regions. The most popular tRNA targets
              were Arg, Leu, Ser and Thr. Mapping of the insertion point on a
              consensus tRNA molecule revealed novel insertion points on the 5'
              side of the D loop, the 3' side of the anticodon loop and the
              anticodon. A novel method of constructing phylogenetic trees of
              phages and prophages was developed based on the mean of the BLAST
              score ratio (BSR) of the phage/prophage proteomes. This method
              verified many known bacteriophage groups, making this a useful
              tool for predicting the relationships of prophages from bacterial
              genomes.}
}

% Arndt et al. 2017 -- review of the PHAST and PHASTER prophage-finding web
% servers and the planned successor PHASTEST.
% NOTE(review): no volume, number or pages are recorded for this entry --
% confirm the final citation details against the publisher record.
@ARTICLE{Arndt2017-rl,
  title    = "{PHAST}, {PHASTER} and {PHASTEST}: Tools for finding prophage in
              bacterial genomes",
  author   = "Arndt, David and Marcu, Ana and Liang, Yongjie and Wishart, David
              S",
  abstract = "PHAST (PHAge Search Tool) and its successor PHASTER (PHAge Search
              Tool - Enhanced Release) have become two of the most widely used
              web servers for identifying putative prophages in bacterial
              genomes. Here we review the main capabilities of these web
              resources, provide some practical guidance regarding their use
              and discuss possible future improvements. PHAST, which was first
              described in 2011, made its debut just as whole bacterial genome
              sequencing and was becoming inexpensive and relatively routine.
              PHAST quickly gained popularity among bacterial genome
              researchers because of its web accessibility, its ease of use
              along with its enhanced accuracy and rapid processing times.
              PHASTER, which appeared in 2016, provided a number of much-needed
              enhancements to the PHAST server, including greater processing
              speed (to cope with very large submission volumes), increased
              database sizes, a more modern user interface, improved graphical
              displays and support for metagenomic submissions. Continuing
              developments in the field, along with increased interest in
              automated phage and prophage finding, have already led to several
              improvements to the PHASTER server and will soon lead to the
              development of a successor to PHASTER (to be called PHASTEST).",
  journal  = "Brief. Bioinform.",
  month    =  sep,
  year     =  2017,
  keywords = "bacterial genome; bioinformatics; metagenomics; phage; prophage",
  language = "en"
}

% Arndt et al. 2016 -- PHASTER web server, the upgrade to the PHAST phage
% search tool. Pages carry the "W" (web server issue) prefix on both ends of
% the range so that styles print the full range.
@ARTICLE{Arndt2016-bm,
  title    = "{PHASTER}: a better, faster version of the {PHAST} phage search
              tool",
  author   = "Arndt, David and Grant, Jason R and Marcu, Ana and Sajed, Tanvir
              and Pon, Allison and Liang, Yongjie and Wishart, David S",
  abstract = "PHASTER (PHAge Search Tool - Enhanced Release) is a significant
              upgrade to the popular PHAST web server for the rapid
              identification and annotation of prophage sequences within
              bacterial genomes and plasmids. Although the steps in the phage
              identification pipeline in PHASTER remain largely the same as in
              the original PHAST, numerous software improvements and
              significant hardware enhancements have now made PHASTER faster,
              more efficient, more visually appealing and much more user
              friendly. In particular, PHASTER is now 4.3$\times$ faster than
              PHAST when analyzing a typical bacterial genome. More
              specifically, software optimizations have made the backend of
              PHASTER 2.7$\times$ faster than PHAST, while the addition of 80
              CPUs to the PHASTER compute cluster are responsible for the
              remaining speed-up. PHASTER can now process a typical bacterial
              genome in 3 min from the raw sequence alone, or in 1.5 min when
              given a pre-annotated GenBank file. A number of other
              optimizations have
              also been implemented, including automated algorithms to reduce
              the size and redundancy of PHASTER's databases, improvements in
              handling multiple (metagenomic) queries and higher user traffic,
              along with the ability to perform automated look-ups against 14
              000 previously PHAST/PHASTER annotated bacterial genomes (which
              can lead to complete phage annotations in seconds as opposed to
              minutes). PHASTER's web interface has also been entirely
              rewritten. A new graphical genome browser has been added,
              gene/genome visualization tools have been improved, and the
              graphical interface is now more modern, robust and user-friendly.
              PHASTER is available online at www.phaster.ca.",
  journal  = "Nucleic Acids Res.",
  volume   =  44,
  number   = "W1",
  pages    = "W16--W21",
  month    =  jul,
  year     =  2016,
  language = "en"
}


% von Meijenfeldt et al. 2019 -- CAT (Contig Annotation Tool) and BAT (Bin
% Annotation Tool) for taxonomic classification of long sequences and
% metagenome-assembled genomes.
@ARTICLE{Von_Meijenfeldt2019-ae,
  title    = "Robust taxonomic classification of uncharted microbial sequences
              and bins with {CAT} and {BAT}",
  author   = "von Meijenfeldt, F A Bastiaan and Arkhipova, Ksenia and Cambuy,
              Diego D and Coutinho, Felipe H and Dutilh, Bas E",
  abstract = "Current-day metagenomics analyses increasingly involve de novo
              taxonomic classification of long DNA sequences and
              metagenome-assembled genomes. Here, we show that the conventional
              best-hit approach often leads to classifications that are too
              specific, especially when the sequences represent novel deep
              lineages. We present a classification method that integrates
              multiple signals to classify sequences (Contig Annotation Tool,
              CAT) and metagenome-assembled genomes (Bin Annotation Tool, BAT).
              Classifications are automatically made at low taxonomic ranks if
              closely related organisms are present in the reference database
              and at higher ranks otherwise. The result is a high
              classification precision even for sequences from considerably
              unknown organisms.",
  journal  = "Genome Biol.",
  volume   =  20,
  number   =  1,
  pages    = "217",
  month    =  oct,
  year     =  2019,
  language = "en"
}


% Roux et al. 2014 -- Metavir 2: comparative analysis of viromes and an
% annotation pipeline for assembled viral contigs.
@ARTICLE{Roux2014-dc,
  title    = "Metavir 2: new tools for viral metagenome comparison and
              assembled virome analysis",
  author   = "Roux, Simon and Tournayre, Jeremy and Mahul, Antoine and Debroas,
              Didier and Enault, Fran{\c c}ois",
  abstract = "BACKGROUND: Metagenomics, based on culture-independent
              sequencing, is a well-fitted approach to provide insights into
              the composition, structure and dynamics of environmental viral
              communities. Following recent advances in sequencing
              technologies, new challenges arise for existing bioinformatic
              tools dedicated to viral metagenome (i.e. virome) analysis as (i)
              the number of viromes is rapidly growing and (ii) large genomic
              fragments can now be obtained by assembling the huge amount of
              sequence data generated for each metagenome. RESULTS: To face
              these challenges, a new version of Metavir was developed. First,
              all Metavir tools have been adapted to support comparative
              analysis of viromes in order to improve the analysis of multiple
              datasets. In addition to the sequence comparison previously
              provided, viromes can now be compared through their k-mer
              frequencies, their taxonomic compositions, recruitment plots and
              phylogenetic trees containing sequences from different datasets.
              Second, a new section has been specifically designed to handle
              assembled viromes made of thousands of large genomic fragments
              (i.e. contigs). This section includes an annotation pipeline for
              uploaded viral contigs (gene prediction, similarity search
              against reference viral genomes and protein domains) and an
              extensive comparison between contigs and reference genomes.
              Contigs and their annotations can be explored on the website
              through specifically developed dynamic genomic maps and
              interactive networks. CONCLUSIONS: The new features of Metavir 2
              allow users to explore and analyze viromes composed of raw reads
              or assembled fragments through a set of adapted tools and a
              user-friendly interface.",
  journal  = "BMC Bioinformatics",
  volume   =  15,
  pages    = "76",
  month    =  mar,
  year     =  2014,
  language = "en"
}


% Galan et al. 2019 -- Host Taxon Predictor (HTP): classifiers that
% distinguish eukaryotic viruses from phages. The tool name is brace-protected
% so sentence-casing styles keep its capitals, as for other tool names in this
% file.
@ARTICLE{Galan2019-yn,
  title    = "{Host Taxon Predictor} -- A Tool for Predicting Taxon of the Host
              of a Newly Discovered Virus",
  author   = "Ga{\l}an, Wojciech and B{\k a}k, Maciej and Jakubowska,
              Ma{\l}gorzata",
  abstract = "Recent advances in metagenomics provided a valuable alternative
              to culture-based approaches for better sampling viral diversity.
              However, some of newly identified viruses lack sequence
              similarity to any of previously sequenced ones, and cannot be
              easily assigned to their hosts. Here we present a bioinformatic
              approach to this problem. We developed classifiers capable of
              distinguishing eukaryotic viruses from the phages achieving
              almost 95\% prediction accuracy. The classifiers are wrapped in
              Host Taxon Predictor (HTP) software written in Python which is
              freely available at
              https://github.com/wojciech-galan/viruses\_classifier . HTP's
              performance was later demonstrated on a collection of newly
              identified viral genomes and genome fragments. In summary, HTP is
              a culture- and alignment-free approach for distinction between
              phages and eukaryotic viruses. We have also shown that it is
              possible to further extend our method to go up the evolutionary
              tree and predict whether a virus can infect narrower taxa.",
  journal  = "Sci. Rep.",
  volume   =  9,
  number   =  1,
  pages    = "3436",
  month    =  mar,
  year     =  2019,
  language = "en"
}


% Ahlgren et al. 2017 -- d2* oligonucleotide-frequency dissimilarity measure
% for predicting the hosts of viral sequences. The approximately-equal sign
% and the formula placeholders in the abstract are written as LaTeX math, so
% the entry is plain ASCII.
@ARTICLE{Ahlgren2017-uu,
  title    = "Alignment-free {$d_2^*$} oligonucleotide frequency dissimilarity
              measure improves prediction of hosts from metagenomically-derived
              viral sequences",
  author   = "Ahlgren, Nathan A and Ren, Jie and Lu, Yang Young and Fuhrman,
              Jed A and Sun, Fengzhu",
  abstract = "Viruses and their host genomes often share similar
              oligonucleotide frequency (ONF) patterns, which can be used to
              predict the host of a given virus by finding the host with the
              greatest ONF similarity. We comprehensively compared 11 ONF
              metrics using several k-mer lengths for predicting host taxonomy
              from among $\sim$32 000 prokaryotic genomes for 1427 virus isolate
              genomes whose true hosts are known. The background-subtracting
              measure $d_2^*$ at k = 6 gave the highest host
              prediction accuracy (33\%, genus level) with reasonable
              computational times. Requiring a maximum dissimilarity score for
              making predictions (thresholding) and taking the consensus of the
              30 most similar hosts further improved accuracy. Using a previous
              dataset of 820 bacteriophage and 2699 bacterial genomes,
              $d_2^*$ host prediction accuracies with thresholding
              and consensus methods (genus-level: 64\%) exceeded previous
              Euclidian distance ONF (32\%) or homology-based (22--62\%)
              methods. When applied to metagenomically-assembled marine SUP05
              viruses and the human gut virus crAssphage, $d_2^*$-based
              predictions overlapped (i.e. some same, some
              different) with the previously inferred hosts of these viruses.
              The extent of overlap improved when only using host genomes or
              metagenomic contigs from the same habitat or samples as the query
              viruses. The $d_2^*$ ONF method will greatly improve
              the characterization of novel, metagenomic viruses.",
  journal  = "Nucleic Acids Res.",
  volume   =  45,
  number   =  1,
  pages    = "39--53",
  month    =  jan,
  year     =  2017,
  language = "en"
}

% de Sousa et al. 2018 -- PhageWeb: web tool for identifying and functionally
% characterizing prophage regions in bacterial genomes.
@ARTICLE{De_Sousa2018-cz,
  title    = "{PhageWeb} -- Web Interface for Rapid Identification and
              Characterization of Prophages in Bacterial Genomes",
  author   = "de Sousa, Ailton Lopes and Mau{\'e}s, Dener and Lobato,
              Am{\'a}lia and Franco, Edian F and Pinheiro, Kenny and
              Ara{\'u}jo, Fabr{\'\i}cio and Pantoja, Yan and da Costa da Silva,
              Artur Luiz and Morais, Jefferson and Ramos, Rommel T J",
  abstract = "This study developed a computational tool with a graphical
              interface and a web-service that allows the identification of
              phage regions through homology search and gene clustering. It
              uses G+C content variation evaluation and tRNA prediction sites
              as evidence to reinforce the presence of prophages in
              indeterminate regions. Also, it performs the functional
              characterization of the prophages regions through data
              integration of biological databases. The performance of PhageWeb
              was compared to other available tools (PHASTER, Prophinder, and
              PhiSpy) using Sensitivity (Sn) and Positive Predictive Value
              (PPV) tests. As a reference for the tests, more than 80 manually
              annotated genomes were used. In the PhageWeb analysis, the Sn
              index was 86.1\% and the PPV was approximately 87\%, while the
              second best tool presented Sn and PPV values of 83.3 and 86.5\%,
              respectively. These numbers allowed us to observe a greater
              precision in the regions identified by PhageWeb while compared to
              other prediction tools submitted to the same tests. Additionally,
              PhageWeb was much faster than the other computational
              alternatives, decreasing the processing time to approximately
              one-ninth of the time required by the second best software.
              PhageWeb is freely available at
              http://computationalbiology.ufpa.br/phageweb.",
  journal  = "Front. Genet.",
  volume   =  9,
  pages    = "644",
  month    =  dec,
  year     =  2018,
  keywords = "bacterial genome; characterization; clustering; phage; prophage;
              web interface; web service",
  language = "en"
}

% Garneau et al. 2017 -- PhageTerm: determines phage DNA termini and packaging
% mechanisms from biases in the number of NGS reads.
@ARTICLE{Garneau2017-fa,
  title    = "{PhageTerm}: a tool for fast and accurate determination of phage
              termini and packaging mechanism using next-generation sequencing
              data",
  author   = "Garneau, Julian R and Depardieu, Florence and Fortier,
              Louis-Charles and Bikard, David and Monot, Marc",
  abstract = "The worrying rise of antibiotic resistance in pathogenic bacteria
              is leading to a renewed interest in bacteriophages as a treatment
              option. Novel sequencing technologies enable description of an
              increasing number of phage genomes, a critical piece of
              information to understand their life cycle, phage-host
              interactions, and evolution. In this work, we demonstrate how it
              is possible to recover more information from sequencing data than
              just the phage genome. We developed a theoretical and statistical
              framework to determine DNA termini and phage packaging mechanisms
              using NGS data. Our method relies on the detection of biases in
              the number of reads, which are observable at natural DNA termini
              compared with the rest of the phage genome. We implemented our
              method with the creation of the software PhageTerm and validated
              it using a set of phages with well-established packaging
              mechanisms representative of the termini diversity, i.e. 5'cos
              (Lambda), 3'cos (HK97), pac (P1), headful without a pac site
              (T4), DTR (T7) and host fragment (Mu). In addition, we determined
              the termini of nine Clostridium difficile phages and six phages
              whose sequences were retrieved from the Sequence Read Archive.
              PhageTerm is freely available
              (https://sourceforge.net/projects/phageterm), as a Galaxy
              ToolShed and on a Galaxy-based server
              (https://galaxy.pasteur.fr).",
  journal  = "Sci. Rep.",
  volume   =  7,
  number   =  1,
  pages    = "8292",
  month    =  aug,
  year     =  2017,
  language = "en"
}


% Deng et al. 2014 -- viral-tagged metagenomics applied to Synechococcus
% cyanophage populations. The genus name is brace-protected so sentence-casing
% styles keep its capital letter.
@ARTICLE{Deng2014-we,
  title    = "Viral tagging reveals discrete populations in {Synechococcus}
              viral genome sequence space",
  author   = "Deng, Li and Ignacio-Espinoza, J Cesar and Gregory, Ann C and
              Poulos, Bonnie T and Weitz, Joshua S and Hugenholtz, Philip and
              Sullivan, Matthew B",
  abstract = "Microbes and their viruses drive myriad processes across
              ecosystems ranging from oceans and soils to bioreactors and
              humans. Despite this importance, microbial diversity is only now
              being mapped at scales relevant to nature, while the viral
              diversity associated with any particular host remains little
              researched. Here we quantify host-associated viral diversity
              using viral-tagged metagenomics, which links viruses to specific
              host cells for high-throughput screening and sequencing. In a
              single experiment, we screened $10^7$ Pacific Ocean viruses
              against a single strain of Synechococcus and found that naturally
              occurring cyanophage genome sequence space is statistically
              clustered into discrete populations. These population-based,
              host-linked viral ecological data suggest that, for this single
              host and seawater sample alone, there are at least 26
              double-stranded DNA viral populations with estimated relative
              abundances ranging from 0.06 to 18.2\%. These populations include
              previously cultivated cyanophage and new viral types missed by
              decades of isolate-based studies. Nucleotide identities of
              homologous genes mostly varied by less than 1\% within
              populations, even in hypervariable genome regions, and by
              42--71\% between populations, which provides benchmarks for viral
              metagenomics and genome-based viral species definitions. Together
              these findings showcase a new approach to viral ecology that
              quantitatively links objectively defined environmental viral
              populations, and their genomes, to their hosts.",
  journal  = "Nature",
  volume   =  513,
  number   =  7517,
  pages    = "242--245",
  month    =  sep,
  year     =  2014,
  language = "en"
}

% Dzunkova et al. 2019 -- single-cell viral tagging applied to the human
% faecal microbiome to establish host-phage pairings.
% NOTE(review): no volume, number or pages are recorded for this entry --
% confirm the final citation details against the publisher record.
@ARTICLE{Dzunkova2019-di,
  title    = "Defining the human gut host-phage network through single-cell
              viral tagging",
  author   = "D{\v z}unkov{\'a}, M{\'a}ria and Low, Soo Jen and Daly, Joshua N
              and Deng, Li and Rinke, Christian and Hugenholtz, Philip",
  abstract = "Viral discovery is accelerating at an unprecedented rate due to
              continuing advances in culture-independent sequence-based
              analyses. One important facet of this discovery is identification
              of the hosts of these recently characterized uncultured viruses.
              To this end, we have adapted the viral tagging approach, which
              bypasses the need for culture-based methods to identify
              host-phage pairings. Fluorescently labelled anonymous virions
              adsorb to unlabelled anonymous bacterial host cells, which are
              then individually sorted as host-phage pairs, followed by genome
              amplification and high-throughput sequencing to establish the
              identities of both the host and the attached virus(es). We
              demonstrate single-cell viral tagging using the faecal
              microbiome, including cross-tagging of viruses and bacteria
              between human subjects. A total of 363 unique host-phage pairings
              were predicted, most of which were subject-specific and involved
              previously uncharacterized viruses despite the majority of their
              bacterial hosts having known taxonomy. One-fifth of these pairs
              were confirmed by multiple individual tagged cells. Viruses
              targeting more than one bacterial species were conspicuously
              absent in the host-phage network, suggesting that phages are not
              major vectors of inter-species horizontal gene transfer in the
              human gut. A high level of cross-reactivity between phages and
              bacteria from different subjects was noted despite
              subject-specific viral profiles, which has implications for
              faecal microbiota transplant therapy.",
  journal  = "Nat. Microbiol.",
  month    =  aug,
  year     =  2019,
  language = "en"
}

% Deng et al. 2012 -- introduces viral tagging (VT), which combines mixtures
% of host cells and fluorescent viruses with flow cytometry.
% NOTE(review): no pages or article number are recorded for this entry --
% confirm against the publisher record.
@ARTICLE{Deng2012-mv,
  title    = "Contrasting life strategies of viruses that infect photo- and
              heterotrophic bacteria, as revealed by viral tagging",
  author   = "Deng, Li and Gregory, Ann and Yilmaz, Suzan and Poulos, Bonnie T
              and Hugenholtz, Philip and Sullivan, Matthew B",
  abstract = "Ocean viruses are ubiquitous and abundant and play important
              roles in global biogeochemical cycles by means of their
              mortality, horizontal gene transfer, and manipulation of host
              metabolism. However, the obstacles involved in linking viruses to
              their hosts in a high-throughput manner bottlenecks our ability
              to understand virus-host interactions in complex communities. We
              have developed a method called viral tagging (VT), which combines
              mixtures of host cells and fluorescent viruses with flow
              cytometry. We investigated multiple viruses which infect each of
              two model marine bacteria that represent the slow-growing,
              photoautotrophic genus Synechococcus (Cyanobacteria) and the
              fast-growing, heterotrophic genus Pseudoalteromonas
              (Gammaproteobacteria). Overall, viral tagging results for viral
              infection were consistent with plaque and liquid infection assays
              for cyanobacterial myo-, podo- and siphoviruses and some (myo-
              and podoviruses) but not all (four siphoviruses) heterotrophic
              bacterial viruses. Virus-tagged Pseudoalteromonas organisms were
              proportional to the added viruses under varied infection
              conditions (virus-bacterium ratios), while no more than 50\% of
              the Synechococcus organisms were virus tagged even at viral
              abundances that exceeded (5 to 10$\times$) that of their hosts.
              Further, we found that host growth phase minimally impacts the
              fraction of virus-tagged Synechococcus organisms while greatly
              affecting phage adsorption to Pseudoalteromonas. Together these
              findings suggest that at least two contrasting viral life
              strategies exist in the oceans and that they likely reflect
              adaptation to their host microbes. Looking forward to the point
              at which the virus-tagging signature is well understood (e.g.,
              for Synechococcus), application to natural communities should
              begin to provide population genomic data at the proper scale for
              predictively modeling two of the most abundant biological
              entities on Earth. Viral study suffers from an inability to link
              viruses to hosts en masse, and yet delineating ``who infects
              whom'' is fundamental to viral ecology and predictive modeling.
              This article describes viral tagging---a high-throughput method
              to
              investigate virus-host interactions by combining the fluorescent
              labeling of viruses for ``tagging'' host cells that can be
              analyzed and sorted using flow cytometry. Two cultivated hosts
              (the cyanobacterium Synechococcus and the gammaproteobacterium
              Pseudoalteromonas) and their viruses (podo-, myo-, and
              siphoviruses) were investigated to validate the method. These
              lab-based experiments indicate that for most virus-host pairings,
              VT (viral tagging) adsorption is equivalent to traditional
              infection by liquid and plaque assays, with the exceptions being
              confined to promiscuous adsorption by Pseudoalteromonas
              siphoviruses. These experiments also reveal variability in life
              strategies across these oceanic virus-host systems with respect
              to infection conditions and host growth status, which highlights
              the need for further model system characterization to break open
              this virus-host interaction ``black box.''",
  journal  = "mBio",
  volume   =  3,
  number   =  6,
  year     =  2012,
  language = "en"
}


% Starikova et al. 2019 -- Phigaro preprint: command-line detection of
% prophage regions in genome and metagenome assemblies.
% Classic styles require a note on unpublished entries and ignore the journal
% field there, so the preprint server is repeated in note.
@UNPUBLISHED{Starikova2019-ng,
  title    = "Phigaro: high throughput prophage sequence annotation",
  author   = "Starikova, Elizaveta V and Tikhonova, Polina O and Prianichnikov,
              Nikita A and Rands, Chris M and Zdobnov, Evgeny M and Govorun,
              Vadim M",
  abstract = "SUMMARY: Phigaro is a standalone command-line application
              that is able to detect prophage regions taking raw genome and
              metagenome assemblies as an input. It also produces dynamic
              annotated ``prophage genome maps'' and marks possible transposon
              insertion spots inside prophages. It provides putative taxonomic
              annotations that can distinguish tailed from non-tailed phages.
              It is applicable for mining prophage regions from large
              metagenomic datasets. AVAILABILITY: Source code for Phigaro is
              freely available for download at
              https://github.com/bobeobibo/phigaro along with test data. The
              code is written in Python.",
  journal  = "bioRxiv",
  pages    = "598243",
  month    =  apr,
  year     =  2019,
  note     = "Preprint, bioRxiv 598243",
  doi      = "10.1101/598243",
  language = "en"
}


% Skennerton et al. 2013 -- Crass: identifies and reconstructs CRISPR loci
% from raw (unassembled) metagenomic data.
@ARTICLE{Skennerton2013-jn,
  title    = "Crass: identification and reconstruction of {CRISPR} from
              unassembled metagenomic data",
  author   = "Skennerton, Connor T and Imelfort, Michael and Tyson, Gene W",
  abstract = "Clustered regularly interspaced short palindromic repeats
              (CRISPR) constitute a bacterial and archaeal adaptive immune
              system that protect against bacteriophage (phage). Analysis of
              CRISPR loci reveals the history of phage infections and provides
              a direct link between phage and their hosts. All current tools
              for CRISPR identification have been developed to analyse
              completed genomes and are not well suited to the analysis of
              metagenomic data sets, where CRISPR loci are difficult to
              assemble owing to their repetitive structure and population
              heterogeneity. Here, we introduce a new algorithm, Crass, which
              is designed to identify and reconstruct CRISPR loci from raw
              metagenomic data without the need for assembly or prior knowledge
              of CRISPR in the data set. CRISPR in assembled data are often
              fragmented across many contigs/scaffolds and do not fully
              represent the population heterogeneity of CRISPR loci. Crass
              identified substantially more CRISPR in metagenomes previously
              analysed using assembly-based approaches. Using Crass, we were
              able to detect CRISPR that contained spacers with sequence
              homology to phage in the system, which would not have been
              identified using other approaches. The increased sensitivity,
              specificity and speed of Crass will facilitate comprehensive
              analysis of CRISPRs in metagenomic data sets, increasing our
              understanding of phage-host interactions and co-evolution within
              microbial communities.",
  journal  = "Nucleic Acids Res.",
  volume   =  41,
  number   =  10,
  pages    = "e105",
  month    =  may,
  year     =  2013,
  language = "en"
}

% Lei and Sun 2016 -- metaCRISPR: CRISPR finding in metagenomic data using
% local assembly instead of generic assembly.
@ARTICLE{Lei2016-ck,
  title    = "Assemble {CRISPRs} from metagenomic sequencing data",
  author   = "Lei, Jikai and Sun, Yanni",
  abstract = "MOTIVATION: Clustered regularly interspaced short palindromic
              repeats and associated proteins (CRISPR-Cas) allows more specific
              and efficient gene editing than all previous genetic engineering
              systems. These exciting discoveries stem from the finding of the
              CRISPR system being an adaptive immune system that protects the
              prokaryotes against exogenous genetic elements such as phages.
              Despite the exciting discoveries, almost all knowledge about
              CRISPRs is based only on microorganisms that can be isolated,
              cultured and sequenced in labs. However, about 95\% of bacterial
              species cannot be cultured in labs. The fast accumulation of
              metagenomic data, which contains DNA sequences of microbial
              species from natural samples, provides a unique opportunity for
              CRISPR annotation in uncultivable microbial species. However, the
              large amount of data, heterogeneous coverage and shared leader
              sequences of some CRISPRs pose challenges for identifying CRISPRs
              efficiently in metagenomic data. RESULTS: In this study, we
              developed a CRISPR finding tool for metagenomic data without
              relying on generic assembly, which is error-prone and
              computationally expensive for complex data. Our tool can run on
              commonly available machines in small labs. It employs properties
              of CRISPRs to decompose generic assembly into local assembly. We
              tested it on both mock and real metagenomic data and benchmarked
              the performance with state-of-the-art tools. AVAILABILITY AND
              IMPLEMENTATION: The source code and the documentation of
              metaCRISPR is available at
              https://github.com/hangelwen/metaCRISPR CONTACT:
              yannisun@msu.edu.",
  journal  = "Bioinformatics",
  volume   =  32,
  number   =  17,
  pages    = "i520--i528",
  month    =  sep,
  year     =  2016,
  language = "en"
}

% Rho et al. 2012, PLoS Genet. 8(6).
% The approximation sign in the abstract is written as the LaTeX math macro
% $\sim$ so that this entry is pure ASCII and safe for classic BibTeX.
@ARTICLE{Rho2012-gf,
  title    = "Diverse {CRISPRs} evolving in human microbiomes",
  author   = "Rho, Mina and Wu, Yu-Wei and Tang, Haixu and Doak, Thomas G and
              Ye, Yuzhen",
  abstract = "CRISPR (Clustered Regularly Interspaced Short Palindromic
              Repeats) loci, together with cas (CRISPR-associated) genes, form
              the CRISPR/Cas adaptive immune system, a primary defense strategy
              that eubacteria and archaea mobilize against foreign nucleic
              acids, including phages and conjugative plasmids. Short spacer
              sequences separated by the repeats are derived from foreign DNA
              and direct interference to future infections. The availability of
              hundreds of shotgun metagenomic datasets from the Human
              Microbiome Project (HMP) enables us to explore the distribution
              and diversity of known CRISPRs in human-associated microbial
              communities and to discover new CRISPRs. We propose a targeted
              assembly strategy to reconstruct CRISPR arrays, which
              whole-metagenome assemblies fail to identify. For each known
              CRISPR type (identified from reference genomes), we use its
              direct repeat consensus sequence to recruit reads from each HMP
              dataset and then assemble the recruited reads into CRISPR loci;
              the unique spacer sequences can then be extracted for analysis.
              We also identified novel CRISPRs or new CRISPR variants in
              contigs from whole-metagenome assemblies and used targeted
              assembly to more comprehensively identify these CRISPRs across
              samples. We observed that the distributions of CRISPRs (including
              64 known and 86 novel ones) are largely body-site specific. We
              provide detailed analysis of several CRISPR loci, including novel
              CRISPRs. For example, known streptococcal CRISPRs were identified
              in most oral microbiomes, totaling $\sim$8,000 unique spacers:
              samples resampled from the same individual and oral site shared
              the most spacers; different oral sites from the same individual
              shared significantly fewer, while different individuals had
              almost no common spacers, indicating the impact of subtle niche
              differences on the evolution of CRISPR defenses. We further
              demonstrate potential applications of CRISPRs to the tracing of
              rare species and the virus exposure of individuals. This work
              indicates the importance of effective identification and
              characterization of CRISPR loci to the study of the dynamic
              ecology of microbiomes.",
  journal  = "PLoS Genet.",
  volume   =  8,
  number   =  6,
  pages    = "e1002441",
  month    =  jun,
  year     =  2012,
  language = "en"
}

% Moller and Liang 2017, PeerJ 5.
% Parenthetical dashes in the abstract are written as LaTeX em dashes (---)
% rather than single hyphens, which would read as compound words.
@ARTICLE{Moller2017-au,
  title    = "{MetaCRAST}: reference-guided extraction of {CRISPR} spacers from
              unassembled metagenomes",
  author   = "Moller, Abraham G and Liang, Chun",
  abstract = "Clustered regularly interspaced short palindromic repeat (CRISPR)
              systems are the adaptive immune systems of bacteria and archaea
              against viral infection. While CRISPRs have been exploited as a
              tool for genetic engineering, their spacer sequences can also
              provide valuable insights into microbial ecology by linking
              environmental viruses to their microbial hosts. Despite this
              importance, metagenomic CRISPR detection remains a major
              challenge. Here we present a reference-guided CRISPR spacer
              detection tool (Metagenomic CRISPR Reference-Aided Search
              Tool---MetaCRAST) that constrains searches based on
              user-specified direct repeats (DRs). These DRs could be expected
              from assembly or taxonomic profiles of metagenomes. We compared
              the performance of MetaCRAST to those of two existing metagenomic
              CRISPR detection tools---Crass and MinCED---using both real and
              simulated acid mine drainage (AMD) and enhanced biological
              phosphorus removal (EBPR) metagenomes. Our evaluation shows
              MetaCRAST improves CRISPR spacer detection in real metagenomes
              compared to the de novo CRISPR detection methods Crass and
              MinCED. Evaluation on simulated metagenomes show it performs
              better than de novo tools for Illumina metagenomes and comparably
              for 454 metagenomes. It also has comparable performance
              dependence on read length and community composition, run time,
              and accuracy to these tools. MetaCRAST is implemented in Perl,
              parallelizable through the Many Core Engine (MCE), and takes
              metagenomic sequence reads and direct repeat queries (FASTA or
              FASTQ) as input. It is freely available for download at
              https://github.com/molleraj/MetaCRAST.",
  journal  = "PeerJ",
  volume   =  5,
  pages    = "e3788",
  month    =  sep,
  year     =  2017,
  keywords = "CRISPR; Metagenomics; Microbial ecology; Repetitive sequences;
              Virus-host interactions",
  language = "en"
}