Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Write BAM header on sam_file deconstruction if not written before #3081

Merged
merged 4 commits into from
Oct 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.

## Notable Bug-fixes

#### I/O
* Empty SAM/BAM files must at least write a header to ensure a valid file
([\#3081](https://github.com/seqan/seqan3/pull/3081)).

## API changes

#### Dependencies
Expand Down
169 changes: 85 additions & 84 deletions include/seqan3/io/sam_file/detail/format_sam_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,8 @@ class format_sam_base
sam_file_header<ref_ids_type> & hdr,
ref_seqs_type & /*ref_id_to_pos_map*/);

template <typename stream_t, typename ref_ids_type>
void
write_header(stream_t & stream, sam_file_output_options const & options, sam_file_header<ref_ids_type> & header);
template <typename stream_t, typename header_type>
void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);
};

/*!\brief Checks for known reference ids or adds a new reference is and assigns a reference id to `ref_id`.
Expand Down Expand Up @@ -702,115 +701,117 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
* according to the rules of the official
* [SAM format specifications](https://samtools.github.io/hts-specs/SAMv1.pdf).
*/
template <typename stream_t, typename ref_ids_type>
inline void format_sam_base::write_header(stream_t & stream,
sam_file_output_options const & options,
sam_file_header<ref_ids_type> & header)
template <typename stream_t, typename header_type>
inline void
format_sam_base::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header)
{
// -----------------------------------------------------------------
// Check Header
// -----------------------------------------------------------------
if constexpr (!detail::decays_to_ignore_v<header_type>)
{
// -----------------------------------------------------------------
// Check Header
// -----------------------------------------------------------------

// (@HD) Check header line
// The format version string will be taken from the local member variable
if (!header.sorting.empty()
&& !(header.sorting == "unknown" || header.sorting == "unsorted" || header.sorting == "queryname"
|| header.sorting == "coordinate"))
throw format_error{"SAM format error: The header.sorting member must be "
"one of [unknown, unsorted, queryname, coordinate]."};
// (@HD) Check header line
// The format version string will be taken from the local member variable
if (!header.sorting.empty()
&& !(header.sorting == "unknown" || header.sorting == "unsorted" || header.sorting == "queryname"
|| header.sorting == "coordinate"))
throw format_error{"SAM format error: The header.sorting member must be "
"one of [unknown, unsorted, queryname, coordinate]."};

if (!header.grouping.empty()
&& !(header.grouping == "none" || header.grouping == "query" || header.grouping == "reference"))
throw format_error{"SAM format error: The header.grouping member must be "
"one of [none, query, reference]."};
if (!header.grouping.empty()
&& !(header.grouping == "none" || header.grouping == "query" || header.grouping == "reference"))
throw format_error{"SAM format error: The header.grouping member must be "
"one of [none, query, reference]."};

// (@SQ) Check Reference Sequence Dictionary lines
// (@SQ) Check Reference Sequence Dictionary lines

// TODO
// TODO

// - sorting order be one of ...
// - grouping can be one of ...
// - reference names must be unique
// - ids of read groups must be unique
// - program ids need to be unique
// many more small semantic things, like fits REGEX
// - sorting order be one of ...
// - grouping can be one of ...
// - reference names must be unique
// - ids of read groups must be unique
// - program ids need to be unique
// many more small semantic things, like fits REGEX

// -----------------------------------------------------------------
// Write Header
// -----------------------------------------------------------------
std::ostreambuf_iterator stream_it{stream};
// -----------------------------------------------------------------
// Write Header
// -----------------------------------------------------------------
std::ostreambuf_iterator stream_it{stream};

// (@HD) Write header line [required].
stream << "@HD\tVN:";
std::ranges::copy(format_version, stream_it);
// (@HD) Write header line [required].
stream << "@HD\tVN:";
std::ranges::copy(format_version, stream_it);

if (!header.sorting.empty())
stream << "\tSO:" << header.sorting;
if (!header.sorting.empty())
stream << "\tSO:" << header.sorting;

if (!header.subsorting.empty())
stream << "\tSS:" << header.subsorting;
if (!header.subsorting.empty())
stream << "\tSS:" << header.subsorting;

if (!header.grouping.empty())
stream << "\tGO:" << header.grouping;
if (!header.grouping.empty())
stream << "\tGO:" << header.grouping;

detail::write_eol(stream_it, options.add_carriage_return);
detail::write_eol(stream_it, options.add_carriage_return);

// (@SQ) Write Reference Sequence Dictionary lines [required].
for (auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info))
{
stream << "@SQ\tSN:";
// (@SQ) Write Reference Sequence Dictionary lines [required].
for (auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info))
{
stream << "@SQ\tSN:";

std::ranges::copy(ref_name, stream_it);
std::ranges::copy(ref_name, stream_it);

stream << "\tLN:" << get<0>(ref_info);
stream << "\tLN:" << get<0>(ref_info);

if (!get<1>(ref_info).empty())
stream << "\t" << get<1>(ref_info);
if (!get<1>(ref_info).empty())
stream << "\t" << get<1>(ref_info);

detail::write_eol(stream_it, options.add_carriage_return);
}
detail::write_eol(stream_it, options.add_carriage_return);
}

// Write read group (@RG) lines if specified.
for (auto const & read_group : header.read_groups)
{
stream << "@RG"
<< "\tID:" << get<0>(read_group);
// Write read group (@RG) lines if specified.
for (auto const & read_group : header.read_groups)
{
stream << "@RG"
<< "\tID:" << get<0>(read_group);

if (!get<1>(read_group).empty())
stream << "\t" << get<1>(read_group);
if (!get<1>(read_group).empty())
stream << "\t" << get<1>(read_group);

detail::write_eol(stream_it, options.add_carriage_return);
}
detail::write_eol(stream_it, options.add_carriage_return);
}

// Write program (@PG) lines if specified.
for (auto const & program : header.program_infos)
{
stream << "@PG"
<< "\tID:" << program.id;
// Write program (@PG) lines if specified.
for (auto const & program : header.program_infos)
{
stream << "@PG"
<< "\tID:" << program.id;

if (!program.name.empty())
stream << "\tPN:" << program.name;
if (!program.name.empty())
stream << "\tPN:" << program.name;

if (!program.command_line_call.empty())
stream << "\tCL:" << program.command_line_call;
if (!program.command_line_call.empty())
stream << "\tCL:" << program.command_line_call;

if (!program.previous.empty())
stream << "\tPP:" << program.previous;
if (!program.previous.empty())
stream << "\tPP:" << program.previous;

if (!program.description.empty())
stream << "\tDS:" << program.description;
if (!program.description.empty())
stream << "\tDS:" << program.description;

if (!program.version.empty())
stream << "\tVN:" << program.version;
if (!program.version.empty())
stream << "\tVN:" << program.version;

detail::write_eol(stream_it, options.add_carriage_return);
}
detail::write_eol(stream_it, options.add_carriage_return);
}

// Write comment (@CO) lines if specified.
for (auto const & comment : header.comments)
{
stream << "@CO\t" << comment;
detail::write_eol(stream_it, options.add_carriage_return);
// Write comment (@CO) lines if specified.
for (auto const & comment : header.comments)
{
stream << "@CO\t" << comment;
detail::write_eol(stream_it, options.add_carriage_return);
}
}
}

Expand Down
84 changes: 60 additions & 24 deletions include/seqan3/io/sam_file/format_bam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@ class format_bam : private detail::format_sam_base
[[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
[[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));

//!\privatesection
template <typename stream_t, typename header_type>
void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);

private:
//!\brief A variable that tracks whether the content of header has been read or not.
bool header_was_read{false};
Expand Down Expand Up @@ -726,8 +730,9 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st
if constexpr (detail::decays_to_ignore_v<header_type>)
{
throw format_error{"BAM can only be written with a header but you did not provide enough information! "
"You can either construct the output file with ref_ids and ref_seqs information and "
"the header will be created for you, or you can access the `header` member directly."};
"You can either construct the output file with reference names and reference length "
"information and the header will be created for you, or you can access the `header` member "
"directly."};
}
else
{
Expand All @@ -745,28 +750,7 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st
// ---------------------------------------------------------------------
if (!header_was_written)
{
stream << "BAM\1";
std::ostringstream os;
write_header(os, options, header); // write SAM header to temporary stream to query the size.
int32_t l_text{static_cast<int32_t>(os.str().size())};
std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id

stream << os.str();

int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id

for (int32_t ridx = 0; ridx < n_ref; ++ridx)
{
int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
// write reference name:
std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
stream_it = '\0';
// write reference sequence length:
std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
}

write_header(stream, options, header);
header_was_written = true;
}

Expand Down Expand Up @@ -961,6 +945,58 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st
} // if constexpr (!detail::decays_to_ignore_v<header_type>)
}

//!\copydoc seqan3::detail::format_sam_base::write_header
template <typename stream_t, typename header_type>
inline void format_bam::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header)
{
if constexpr (detail::decays_to_ignore_v<header_type>)
{
throw format_error{"BAM can only be written with a header but you did not provide enough information! "
"You can either construct the output file with reference names and reference length "
"information and the header will be created for you, or you can access the `header` member "
"directly."};
}
else
{
detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};

std::ranges::copy_n("BAM\1", 4, stream_it); // Do not copy the null terminator

// write SAM header to temporary stream first to query its size.
std::ostringstream os;
detail::format_sam_base::write_header(os, options, header);
#if SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10)
int32_t const l_text{static_cast<int32_t>(os.str().size())};
#else
int32_t const l_text{static_cast<int32_t>(os.view().size())};
#endif
std::ranges::copy_n(reinterpret_cast<char const *>(&l_text), 4, stream_it); // write text length

#if SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10)
auto header_view = os.str();
#else
auto header_view = os.view();
#endif
std::ranges::copy(header_view, stream_it);

assert(header.ref_ids().size() < (1ull << 32));
int32_t const n_ref{static_cast<int32_t>(header.ref_ids().size())};
std::ranges::copy_n(reinterpret_cast<char const *>(&n_ref), 4, stream_it); // write number of references

for (int32_t ridx = 0; ridx < n_ref; ++ridx)
{
assert(header.ref_ids()[ridx].size() + 1 < (1ull << 32));
int32_t const l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
std::ranges::copy_n(reinterpret_cast<char const *>(&l_name), 4, stream_it); // write l_name
// write reference name:
std::ranges::copy(header.ref_ids()[ridx], stream_it);
stream_it = '\0'; // ++ is not necessary for ostream_iterator
// write reference sequence length:
std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
}
}
}

//!\copydoc seqan3::format_sam::read_sam_dict_vector
template <typename stream_view_type, typename value_type>
inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
Expand Down
2 changes: 1 addition & 1 deletion include/seqan3/io/sam_file/format_sam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ namespace seqan3
*
* \remark For a complete overview, take a look at \ref io_sam_file
*/
class format_sam : private detail::format_sam_base
class format_sam : protected detail::format_sam_base
{
public:
/*!\name Constructors, destructor and assignment
Expand Down
28 changes: 25 additions & 3 deletions include/seqan3/io/sam_file/output.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ class sam_file_output
field::header_ptr>;

static_assert(
[]() constexpr {
[]() constexpr
{
for (field f : selected_field_ids::as_array)
if (!field_ids::contains(f))
return false;
Expand Down Expand Up @@ -149,8 +150,24 @@ class sam_file_output
sam_file_output(sam_file_output &&) = default;
//!\brief Move assignment is defaulted.
sam_file_output & operator=(sam_file_output &&) = default;
//!\brief Destructor is defaulted.
~sam_file_output() = default;
//!\brief The destructor will write the header if it has not been written before.
~sam_file_output()
{
if (header_has_been_written)
return;

assert(!format.valueless_by_exception());

std::visit(
[&](auto & f)
{
if constexpr (std::same_as<ref_ids_type, ref_info_not_given>)
f.write_header(*secondary_stream, options, std::ignore);
else
f.write_header(*secondary_stream, options, *header_ptr);
},
format);
}

/*!\brief Construct from filename.
* \param[in] filename Path to the file you wish to open.
Expand Down Expand Up @@ -592,6 +609,9 @@ class sam_file_output

protected:
//!\privatesection
//!\brief This is needed during deconstruction to know whether a header still needs to be written.
bool header_has_been_written{false};

//!\brief A larger (compared to stl default) stream buffer to use when reading from a file.
std::vector<char> stream_buffer{std::vector<char>(1'000'000)};

Expand Down Expand Up @@ -690,6 +710,8 @@ class sam_file_output
}
},
format);

header_has_been_written = true; // when writing a record, the header is written automatically
}

//!\brief Befriend iterator so it can access the buffers.
Expand Down
Loading