Skip to content

Commit

Permalink
Improve docs
Browse files Browse the repository at this point in the history
  • Loading branch information
wang-q committed Feb 19, 2025
1 parent ba05ac6 commit ad2ce01
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 31 deletions.
16 changes: 10 additions & 6 deletions src/cmd/one.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,18 @@ pub fn make_subcommand() -> Command {
.about("Extract one FA record by name")
.after_help(
r###"
This command extracts a single FASTA record from an input file based on the provided sequence name.
This command extracts a single FA record from an input file based on the provided sequence name.
Notes:
* Case-sensitive name matching
* Stops after finding the first match
* Supports both plain text and gzipped (.gz) files
Examples:
1. Extract a record by name and write to stdout:
1. Extract a record by name:
hnsm one input.fa seq1
2. Extract a record by name and save to a file:
2. Save to a file:
hnsm one input.fa seq1 -o output.fa
"###,
Expand Down Expand Up @@ -55,13 +60,12 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
let name = args.get_one::<String>("name").unwrap();

//----------------------------
// Ops
// Process
//----------------------------
for result in fa_in.records() {
// obtain record or fail with error
let record = result?;
let this_name = String::from_utf8(record.name().into())?;

let this_name = String::from_utf8(record.name().into()).unwrap();
if this_name == *name {
fa_out.write_record(&record)?;
break;
Expand Down
34 changes: 21 additions & 13 deletions src/cmd/order.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,22 @@ pub fn make_subcommand() -> Command {
.after_help(
r###"
This command extracts FA records from an input file in the order specified by a list of sequence names.
All sequences are loaded into memory, so this command may consume significant memory for large files.
Examples:
1. Extract sequences in the order specified by list.txt:
hnsm order input.fa list.txt -o output.fa
Notes:
* Case-sensitive name matching
* One sequence name per line in the list file
* Empty lines and lines starting with '#' are ignored
* All sequences are loaded into memory
* Supports both plain text and gzipped (.gz) files
* Missing sequences in the input file are silently skipped
2. Output to stdout:
Examples:
1. Extract sequences in order specified by list.txt:
hnsm order input.fa list.txt
2. Process gzipped files:
hnsm order input.fa.gz list.txt -o output.fa.gz
"###,
)
.arg(
Expand Down Expand Up @@ -54,27 +61,28 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.set_line_base_count(usize::MAX)
.build_from_writer(writer);

let vec_list = intspan::read_first_column(args.get_one::<String>("list.txt").unwrap());
let list: indexmap::IndexSet<_> =
intspan::read_first_column(args.get_one::<String>("list.txt").unwrap())
.into_iter()
.collect();

//----------------------------
// Ops
// Process
//----------------------------
// Load records into a BTreeMap for efficient lookup
let mut record_of = BTreeMap::new();

for result in fa_in.records() {
// obtain record or fail with error
let record = result?;
let name = String::from_utf8(record.name().into())?;

let name = String::from_utf8(record.name().into()).unwrap();
if vec_list.contains(&name) {
if list.contains(&name) {
record_of.insert(name, record);
}
}

for el in vec_list.iter() {
if record_of.contains_key(el) {
let record = record_of.get(el).unwrap();
for name in list.iter() {
if let Some(record) = record_of.get(name) {
fa_out.write_record(record)?;
}
}
Expand Down
21 changes: 14 additions & 7 deletions src/cmd/some.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,22 @@ pub fn make_subcommand() -> Command {
.after_help(
r###"
This command extracts FASTA records from an input file based on a list of sequence names.
It can also invert the selection to output sequences not in the list.
Notes:
* Case-sensitive name matching
* One sequence name per line in the list file
* Empty lines and lines starting with '#' are ignored
* Supports both plain text and gzipped (.gz) files
Examples:
1. Extract sequences listed in list.txt:
hnsm some input.fa list.txt
2. Extract sequences NOT listed in list.txt (invert selection):
2. Extract sequences NOT in list.txt:
hnsm some input.fa list.txt -i
3. Save the output to a file:
hnsm some input.fa list.txt -o output.fa
3. Process gzipped files:
hnsm some input.fa.gz list.txt -o output.fa.gz
"###,
)
Expand Down Expand Up @@ -67,18 +72,20 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.build_from_writer(writer);

//----------------------------
// Ops
// Load list
//----------------------------
let set_list: HashSet<String> =
intspan::read_first_column(args.get_one::<String>("list.txt").unwrap())
.into_iter()
.collect();

//----------------------------
// Process
//----------------------------
for result in fa_in.records() {
// obtain record or fail with error
let record = result?;

let name = String::from_utf8(record.name().into())?;

if set_list.contains(&name) != is_invert {
fa_out.write_record(&record)?;
}
Expand Down
30 changes: 26 additions & 4 deletions src/cmd/split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,33 @@ pub fn make_subcommand() -> Command {
.about("Split FA file(s) into several files")
.after_help(
r#"
Modes
Split FASTA files into multiple smaller files based on different modes:
1. name: Create separate files for each sequence
* Uses sequence names as filenames (sanitized)
* Special characters ()/: are replaced with _
2. about: Split by approximate size
* -c SIZE: Split into files of about SIZE bytes each
* -e: Ensure even number of sequences per file
* -m NUM: Maximum number of output files (default: 999)
Notes:
* Supports both plain text and gzipped (.gz) files
* Output files are named as xxx.fa
* For 'name' mode, filenames are sanitized
* For 'about' mode, files are zero-padded numbered
Examples:
1. Split by sequence names:
hnsm split name input.fa -o output_dir
2. Split into ~1MB files:
hnsm split about input.fa -c 1000000 -o output_dir
3. Split with even sequences:
hnsm split about input.fa -c 1000000 -e -o output_dir
* name - using sequence names as file names
* about - about `count` bytes each by record
* -c, -e, -m
"#,
)
Expand Down
13 changes: 12 additions & 1 deletion src/libs/nt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
/// ```
#[allow(dead_code)]
#[repr(usize)]
#[derive(Clone, Copy)]
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Nt {
A = 0,
C = 1,
Expand All @@ -33,6 +33,17 @@ pub enum Nt {
#[allow(dead_code)]
impl Nt {
    /// Uracil is treated as an alias of thymine.
    pub const U: Nt = Nt::T;

    /// Translate a single nucleotide character into an `Nt` variant.
    ///
    /// Matching ignores ASCII case. `U`/`u` resolve to the same variant as
    /// `T`/`t`; every other character yields `Nt::Invalid`.
    pub fn from_char(c: char) -> Self {
        // Both cases are listed explicitly rather than normalising the input first.
        match c {
            'A' | 'a' => Nt::A,
            'C' | 'c' => Nt::C,
            'G' | 'g' => Nt::G,
            'T' | 't' | 'U' | 'u' => Nt::T,
            _ => Nt::Invalid,
        }
    }
}

/// Maps an ASCII char to an index
Expand Down

0 comments on commit ad2ce01

Please sign in to comment.