SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
input.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <cassert>
16 #include <seqan3/std/filesystem>
17 #include <fstream>
18 #include <string>
19 #include <variant>
20 #include <vector>
21 
32 #include <seqan3/io/exception.hpp>
42 
43 namespace seqan3
44 {
45 
46 // ----------------------------------------------------------------------------
47 // sequence_file_input_traits
48 // ----------------------------------------------------------------------------
49 
100 template <typename t>
101 SEQAN3_CONCEPT sequence_file_input_traits = requires (t v)
102 {
107 
110 
113 };
115 
116 // ----------------------------------------------------------------------------
117 // sequence_file_input_default_traits
118 // ----------------------------------------------------------------------------
119 
134 {
142 
145 
147  template <typename _sequence_alphabet>
149 
151  using id_alphabet = char;
152 
154  template <typename _id_alphabet>
156 
159 
161  template <typename _quality_alphabet>
163 
165 };
166 
170 {
178 
182 };
183 
184 // ----------------------------------------------------------------------------
185 // sequence_file_input
186 // ----------------------------------------------------------------------------
187 
308 template <
310  detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::qual>,
312  format_fasta,
313  format_fastq,
315  format_sam>>
317 {
318 public:
324  using traits_type = traits_type_;
326  using selected_field_ids = selected_field_ids_;
328  using valid_formats = valid_formats_;
330  using stream_char_type = char;
332 
337 
338  static_assert([] () constexpr
339  {
340  for (field f : selected_field_ids::as_array)
341  if (!field_ids::contains(f))
342  return false;
343  return true;
344  }(),
345  "You selected a field that is not valid for sequence files, please refer to the documentation "
346  "of sequence_file_input::field_ids for the accepted values.");
347 
348  static_assert([] () constexpr
349  {
353  }(),
354  "You may not select field::seq_qual and either of field::seq and field::qual at the same time.");
355 
362  using sequence_type = typename traits_type::template sequence_container<
363  typename traits_type::sequence_alphabet>;
365  using id_type = typename traits_type::template id_container<
366  typename traits_type::id_alphabet>;
368  using quality_type = typename traits_type::template quality_container<
369  typename traits_type::quality_alphabet>;
371  using sequence_quality_type = typename traits_type::
372  template sequence_container<qualified<typename traits_type::sequence_alphabet,
373  typename traits_type::quality_alphabet>>;
374 
377 
379  using record_type = sequence_record<detail::select_types_with_ids_t<field_types,
380  field_ids,
384 
394  using const_reference = void;
396  using size_type = size_t;
402  using const_iterator = void;
404  using sentinel = std::default_sentinel_t;
406 
421  ~sequence_file_input() = default;
422 
440  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
442  {
443  primary_stream->rdbuf()->pubsetbuf(stream_buffer.data(), stream_buffer.size());
444  static_cast<std::basic_ifstream<char> *>(primary_stream.get())->open(filename,
445  std::ios_base::in | std::ios::binary);
446 
447  if (!primary_stream->good())
448  throw file_open_error{"Could not open file " + filename.string() + " for reading."};
449 
450  // possibly add intermediate compression stream
452 
453  // initialise format handler or throw if format is not found
454  detail::set_format(format, filename);
455  }
456  /* NOTE(h-2): Curiously we do not need a user-defined deduction guide for the above constructor.
457  * A combination of default template parameters and auto-deduction guides works as expected,
458  * independent of whether the second/optional parameter is specified or not, i.e. it is possible
459  * to auto-deduce and overwrite a single template parameter out of the four if the optional parameter
460  * is specified and use the default otherwise.
461  */
462 
477  template <input_stream stream_t,
478  sequence_file_input_format file_format>
480  requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
482  sequence_file_input(stream_t & stream,
483  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
484  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
486  format{detail::sequence_file_input_format_exposer<file_format>{}}
487  {
488  static_assert(list_traits::contains<file_format, valid_formats>,
489  "You selected a format that is not in the valid_formats of this file.");
490 
491  // possibly add intermediate compression stream
493  }
494 
496  template <input_stream stream_t,
497  sequence_file_input_format file_format>
499  requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
501  sequence_file_input(stream_t && stream,
502  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
503  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
504  primary_stream{new stream_t{std::move(stream)}, stream_deleter_default},
505  format{detail::sequence_file_input_format_exposer<file_format>{}}
506  {
507  static_assert(list_traits::contains<file_format, valid_formats>,
508  "You selected a format that is not in the valid_formats of this file.");
509 
510  // possibly add intermediate compression stream
512  }
514 
534  {
535  // buffer first record
537  {
539  first_record_was_read = true;
540  }
541 
542  return {*this};
543  }
544 
// Returns the end marker to compare iterators of this file against: a value-initialised
// `sentinel` (aliased to std::default_sentinel_t in this class). Reads no member state.
558  sentinel end() noexcept
559  {
560  return {};
561  }
562 
// Returns the result of dereferencing begin(), i.e. the record the file is currently at.
// NOTE(review): declared noexcept although it goes through begin(), which buffers the
// first record on first use - confirm that this path cannot throw.
586  reference front() noexcept
587  {
588  return *begin();
589  }
591 
593  sequence_file_input_options<typename traits_type::sequence_legal_alphabet,
595 
596 protected:
598 
606 
617 
622 
626  bool at_end{false};
627 
634 
637  {
638  // clear the record
640 
641  // at end if we could not read further
644  {
645  at_end = true;
646  return;
647  }
648 
649  assert(!format.valueless_by_exception());
650  std::visit([&] (auto & f)
651  {
652  // read new record
654  {
655  f.read_sequence_record(*secondary_stream,
656  options,
657  detail::get_or_ignore<field::seq_qual>(record_buffer),
658  detail::get_or_ignore<field::id>(record_buffer),
659  detail::get_or_ignore<field::seq_qual>(record_buffer));
660  }
661  else
662  {
663  f.read_sequence_record(*secondary_stream,
664  options,
665  detail::get_or_ignore<field::seq>(record_buffer),
666  detail::get_or_ignore<field::id>(record_buffer),
667  detail::get_or_ignore<field::qual>(record_buffer));
668  }
669  }, format);
670  }
671 
673  friend iterator;
674 };
675 
682 template <input_stream stream_type,
683  sequence_file_input_format file_format>
684 sequence_file_input(stream_type & stream,
685  file_format const &)
687  typename sequence_file_input<>::selected_field_ids, // default field ids.
689 
691 template <input_stream stream_type,
692  sequence_file_input_format file_format>
693 sequence_file_input(stream_type && stream,
694  file_format const &)
696  typename sequence_file_input<>::selected_field_ids, // default field ids.
698 
700 template <input_stream stream_type,
701  sequence_file_input_format file_format,
702  detail::fields_specialisation selected_field_ids>
703 sequence_file_input(stream_type && stream,
704  file_format const &,
705  selected_field_ids const &)
709 
711 template <input_stream stream_type,
712  sequence_file_input_format file_format,
713  detail::fields_specialisation selected_field_ids>
714 sequence_file_input(stream_type & stream,
715  file_format const &,
716  selected_field_ids const &)
721 
722 } // namespace seqan3
Provides seqan3::aa27, container aliases and string literals.
Provides alphabet adaptations for standard char types.
The twenty-seven letter amino acid alphabet.
Definition: aa27.hpp:46
Input iterator necessary for providing a range-like interface in input file.
Definition: in_file_iterator.hpp:41
The 15 letter DNA alphabet, containing all IUPAC symbols minus the gap.
Definition: dna15.hpp:49
The five letter DNA alphabet of A,C,G,T and the unknown character N.
Definition: dna5.hpp:49
The EMBL format.
Definition: format_embl.hpp:73
The FastA format.
Definition: format_fasta.hpp:82
The FastQ format.
Definition: format_fastq.hpp:80
The GenBank format.
Definition: format_genbank.hpp:74
The SAM format (tag).
Definition: format_sam.hpp:128
Quality type for traditional Sanger and modern Illumina Phred scores.
Definition: phred42.hpp:45
Joins an arbitrary alphabet with a quality alphabet.
Definition: qualified.hpp:59
A class for reading sequence files, e.g. FASTA, FASTQ ...
Definition: input.hpp:317
static void stream_deleter_default(std::basic_istream< stream_char_type > *ptr)
Stream deleter with default behaviour (ownership assumed).
Definition: input.hpp:616
sequence_file_input(stream_type &stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format >>
Deduces the sequence input file type from the stream and the format.
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of field::id (std::string by default).
Definition: input.hpp:366
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition: input.hpp:394
std::default_sentinel_t sentinel
The type returned by end().
Definition: input.hpp:404
sequence_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition: input.hpp:439
type_list< sequence_type, id_type, quality_type, sequence_quality_type > field_types
The previously defined types aggregated in a seqan3::type_list.
Definition: input.hpp:376
sequence_file_input & operator=(sequence_file_input const &)=delete
Copy assignment is explicitly deleted, because you can't have multiple access to the same file.
reference front() noexcept
Return the record we are currently at in the file.
Definition: input.hpp:586
sequence_file_input(stream_type &stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format >>
This is an overloaded member function, provided for convenience. It differs from the above function o...
iterator begin()
Returns an iterator to current position in the file.
Definition: input.hpp:533
format_type format
The actual std::variant holding a pointer to the detected/selected format.
Definition: input.hpp:632
sequence_file_input & operator=(sequence_file_input &&)=default
Move assignment is defaulted.
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition: input.hpp:558
record_type record_buffer
Buffer for a single record.
Definition: input.hpp:602
char stream_char_type
Character type of the stream(s).
Definition: input.hpp:330
size_t size_type
An unsigned integer type, usually std::size_t.
Definition: input.hpp:396
sequence_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition: input.hpp:482
stream_ptr_t primary_stream
The primary stream is the user provided stream or the file stream if constructed from filename.
Definition: input.hpp:619
sequence_file_input(sequence_file_input const &)=delete
Copy construction is explicitly deleted, because you can't have multiple access to the same file.
sequence_file_input(stream_type &&stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format >>
This is an overloaded member function, provided for convenience. It differs from the above function o...
sequence_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: input.hpp:501
~sequence_file_input()=default
Destructor is defaulted.
sequence_file_input(sequence_file_input &&)=default
Move construction is defaulted.
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition: input.hpp:402
typename detail::variant_from_tags< valid_formats, detail::sequence_file_input_format_exposer >::type format_type
Type of the format, an std::variant over the valid_formats.
Definition: input.hpp:630
sequence_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
typename traits_type::template quality_container< typename traits_type::quality_alphabet > quality_type
The type of field::qual (std::vector <seqan3::phred42> by default).
Definition: input.hpp:369
typename traits_type::template sequence_container< typename traits_type::sequence_alphabet > sequence_type
The type of field::seq (std::vector <seqan3::dna5> by default).
Definition: input.hpp:363
traits_type_ traits_type
A traits type that defines aliases and template for storage of the fields.
Definition: input.hpp:324
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition: input.hpp:326
static void stream_deleter_noop(std::basic_istream< stream_char_type > *)
Stream deleter that does nothing (no ownership assumed).
Definition: input.hpp:614
bool first_record_was_read
Tracks whether the very first record is buffered when calling begin().
Definition: input.hpp:624
sequence_file_input_options< typename traits_type::sequence_legal_alphabet, selected_field_ids::contains(field::seq_qual)> options
The options are public and its members can be set directly.
Definition: input.hpp:594
friend iterator
Befriend iterator so it can access the buffers.
Definition: input.hpp:673
std::vector< char > stream_buffer
A larger (compared to stl default) stream buffer to use when reading from a file.
Definition: input.hpp:604
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition: input.hpp:328
typename traits_type::template sequence_container< qualified< typename traits_type::sequence_alphabet, typename traits_type::quality_alphabet > > sequence_quality_type
The type of field::seq_qual (std::vector <seqan3::dna5q> by default).
Definition: input.hpp:373
bool at_end
File is at position 1 behind the last record.
Definition: input.hpp:626
stream_ptr_t secondary_stream
The secondary stream is a compression layer on the primary or just points to the primary (no compress...
Definition: input.hpp:621
sequence_file_input(stream_type &&stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format >>
Deduces the sequence input file type from the stream, the format and the field ids.
void read_next_record()
Tell the format to move to the next record and update the buffer.
Definition: input.hpp:636
fields< field::seq, field::id, field::qual, field::seq_qual > field_ids
The subset of seqan3::field IDs that are valid for this file; order corresponds to the types in field...
Definition: input.hpp:336
sequence_record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field typ...
Definition: input.hpp:382
T data(T... args)
Provides auxiliary data structures and functions for seqan3::record and seqan3::fields.
Provides seqan3::dna15, container aliases and string literals.
Provides seqan3::dna5, container aliases and string literals.
This header includes C++17 filesystem support and imports it into namespace std::filesystem (independ...
Provides the seqan3::sequence_file_format_genbank class.
T get(T... args)
SEQAN3_CONCEPT type_list_of_sequence_file_input_formats
Auxiliary concept that checks whether a type is a seqan3::type_list and all types meet seqan3::sequen...
Definition: input_format_concept.hpp:155
field
An enumerator for the fields used in file formats.
Definition: record.hpp:62
void set_format(format_variant_type &format, std::filesystem::path const &file_name)
Sets the file format according to the file name extension.
Definition: misc.hpp:67
@ seq_qual
Sequence and qualities combined in one range.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
meta::list< types... > type_list
Type that contains multiple types, an alias for meta::list.
Definition: type_list.hpp:31
constexpr bool contains
Whether a type occurs in a type list or not.
Definition: traits.hpp:194
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:70
Provides the seqan3::detail::in_file_iterator class template.
Resolves to std::ranges::explicitly_convertible_to<type1, type2>(). This entity i...
A more refined container concept than seqan3::container.
The generic concept for sequence file in formats.
The requirements a traits_type for seqan3::sequence_file_input must meet.
Refines seqan3::alphabet and adds assignability.
A concept that indicates whether a writable alphabet represents quality scores.
Provides exceptions used in the I/O module.
Stream concepts.
Provides various utility functions required only for input.
auto make_secondary_istream(std::basic_istream< char_t > &primary_stream, std::filesystem::path &filename) -> std::unique_ptr< std::basic_istream< char_t >, std::function< void(std::basic_istream< char_t > *)>>
Depending on the magic bytes of the given stream, return a decompression stream or forward the primar...
Definition: misc_input.hpp:79
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides algorithms for meta programming, parameter packs and seqan3::type_list.
Provides seqan3::phred42 quality scores.
Provides quality alphabet composites.
Provides the seqan3::format_sam.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_record.
T size(T... args)
Internal class used to expose the actual format interface to read sequence records from the file.
Definition: input_format_concept.hpp:41
Base class to deduce the std::variant type from format tags.
Definition: misc.hpp:30
A class template that holds a choice of seqan3::field.
Definition: record.hpp:163
static constexpr bool contains(field f)
Whether a field is contained in the parameter pack.
Definition: record.hpp:181
void clear() noexcept(noexcept(std::apply(expander, std::declval< record & >())))
Clears containers that provide .clear() and (re-)initialises all other elements with = {}.
Definition: record.hpp:267
A traits type that specifies input as amino acids.
Definition: input.hpp:170
The default traits for seqan3::sequence_file_input.
Definition: input.hpp:134
char id_alphabet
The alphabet for an identifier string is char.
Definition: input.hpp:151
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:26
Provides traits for seqan3::type_list.
T visit(T... args)