Improve docs

wang-q · Feb 19, 2025 · ad2ce01 · ad2ce01
1 parent ba05ac6
commit ad2ce01
Show file tree

Hide file tree

Showing 5 changed files with 83 additions and 31 deletions.
diff --git a/src/cmd/one.rs b/src/cmd/one.rs
@@ -6,13 +6,18 @@ pub fn make_subcommand() -> Command {
         .about("Extract one FA record by name")
         .after_help(
             r###"
-This command extracts a single FASTA record from an input file based on the provided sequence name.
+This command extracts a single FA record from an input file based on the provided sequence name.
+
+Notes:
+* Case-sensitive name matching
+* Stops after finding the first match
+* Supports both plain text and gzipped (.gz) files
 
 Examples:
-1. Extract a record by name and write to stdout:
+1. Extract a record by name:
    hnsm one input.fa seq1
 
-2. Extract a record by name and save to a file:
+2. Save to a file:
    hnsm one input.fa seq1 -o output.fa
 
 "###,
@@ -55,13 +60,12 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
     let name = args.get_one::<String>("name").unwrap();
 
     //----------------------------
-    // Ops
+    // Process
     //----------------------------
     for result in fa_in.records() {
-        // obtain record or fail with error
         let record = result?;
+        let this_name = String::from_utf8(record.name().into())?;
 
-        let this_name = String::from_utf8(record.name().into()).unwrap();
         if this_name == *name {
             fa_out.write_record(&record)?;
             break;

diff --git a/src/cmd/order.rs b/src/cmd/order.rs
@@ -8,15 +8,22 @@ pub fn make_subcommand() -> Command {
         .after_help(
             r###"
 This command extracts FA records from an input file in the order specified by a list of sequence names.
-All sequences are loaded into memory, so this command may consume significant memory for large files.
 
-Examples:
-1. Extract sequences in the order specified by list.txt:
-   hnsm order input.fa list.txt -o output.fa
+Notes:
+* Case-sensitive name matching
+* One sequence name per line in the list file
+* Empty lines and lines starting with '#' are ignored
+* All sequences are loaded into memory
+* Supports both plain text and gzipped (.gz) files
+* Missing sequences in the input file are silently skipped
 
-2. Output to stdout:
+Examples:
+1. Extract sequences in the order specified by list.txt:
    hnsm order input.fa list.txt
 
+2. Process gzipped files:
+   hnsm order input.fa.gz list.txt -o output.fa.gz
+
 "###,
         )
         .arg(
@@ -54,27 +61,28 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
         .set_line_base_count(usize::MAX)
         .build_from_writer(writer);
 
-    let vec_list = intspan::read_first_column(args.get_one::<String>("list.txt").unwrap());
+    let list: indexmap::IndexSet<_> =
+        intspan::read_first_column(args.get_one::<String>("list.txt").unwrap())
+            .into_iter()
+            .collect();
 
     //----------------------------
-    // Ops
+    // Process
     //----------------------------
     // Load records into a BTreeMap for efficient lookup
     let mut record_of = BTreeMap::new();
 
     for result in fa_in.records() {
-        // obtain record or fail with error
         let record = result?;
+        let name = String::from_utf8(record.name().into())?;
 
-        let name = String::from_utf8(record.name().into()).unwrap();
-        if vec_list.contains(&name) {
+        if list.contains(&name) {
             record_of.insert(name, record);
         }
     }
 
-    for el in vec_list.iter() {
-        if record_of.contains_key(el) {
-            let record = record_of.get(el).unwrap();
+    for name in list.iter() {
+        if let Some(record) = record_of.get(name) {
             fa_out.write_record(record)?;
         }
     }

diff --git a/src/cmd/some.rs b/src/cmd/some.rs
@@ -8,17 +8,22 @@ pub fn make_subcommand() -> Command {
         .after_help(
             r###"
 This command extracts FASTA records from an input file based on a list of sequence names.
-It can also invert the selection to output sequences not in the list.
+
+Notes:
+* Case-sensitive name matching
+* One sequence name per line in the list file
+* Empty lines and lines starting with '#' are ignored
+* Supports both plain text and gzipped (.gz) files
 
 Examples:
 1. Extract sequences listed in list.txt:
    hnsm some input.fa list.txt
 
-2. Extract sequences NOT listed in list.txt (invert selection):
+2. Extract sequences NOT in list.txt:
    hnsm some input.fa list.txt -i
 
-3. Save the output to a file:
-   hnsm some input.fa list.txt -o output.fa
+3. Process gzipped files:
+   hnsm some input.fa.gz list.txt -o output.fa.gz
 
 "###,
         )
@@ -67,18 +72,20 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
         .build_from_writer(writer);
 
     //----------------------------
-    // Ops
+    // Load list
     //----------------------------
     let set_list: HashSet<String> =
         intspan::read_first_column(args.get_one::<String>("list.txt").unwrap())
             .into_iter()
             .collect();
 
+    //----------------------------
+    // Process
+    //----------------------------
     for result in fa_in.records() {
-        // obtain record or fail with error
         let record = result?;
-
         let name = String::from_utf8(record.name().into())?;
+
         if set_list.contains(&name) != is_invert {
             fa_out.write_record(&record)?;
         }

diff --git a/src/cmd/split.rs b/src/cmd/split.rs
@@ -8,11 +8,33 @@ pub fn make_subcommand() -> Command {
         .about("Split FA file(s) into several files")
         .after_help(
             r#"
-Modes
+Split FASTA files into multiple smaller files based on different modes:
+
+1. name: Create separate files for each sequence
+   * Uses sequence names as filenames (sanitized)
+   * Special characters ()/: are replaced with _
+
+2. about: Split by approximate size
+   * -c SIZE: Split into files of about SIZE bytes each
+   * -e: Ensure an even number of sequences per file
+   * -m NUM: Maximum number of output files (default: 999)
+
+Notes:
+* Supports both plain text and gzipped (.gz) files
+* Output files are named as xxx.fa
+* For 'name' mode, filenames are sanitized
+* For 'about' mode, files are numbered with zero-padded indices
+
+Examples:
+1. Split by sequence names:
+   hnsm split name input.fa -o output_dir
+
+2. Split into ~1MB files:
+   hnsm split about input.fa -c 1000000 -o output_dir
+
+3. Split with even sequences:
+   hnsm split about input.fa -c 1000000 -e -o output_dir
 
-* name  - using sequence names as file names
-* about - about `count` bytes each by record
-    * -c, -e, -m
 
 "#,
         )

diff --git a/src/libs/nt.rs b/src/libs/nt.rs
@@ -20,7 +20,7 @@
 /// ```
 #[allow(dead_code)]
 #[repr(usize)]
-#[derive(Clone, Copy)]
+#[derive(Clone, Copy, Debug, PartialEq)]
 pub enum Nt {
     A = 0,
     C = 1,
@@ -33,6 +33,17 @@ pub enum Nt {
 #[allow(dead_code)]
 impl Nt {
     pub const U: Nt = Nt::T;
+
+    /// Convert a nucleotide character (ASCII case-insensitive; `U` maps to `T`) to its `Nt` variant, or `Nt::Invalid` otherwise
+    pub fn from_char(c: char) -> Self {
+        match c.to_ascii_uppercase() {
+            'A' => Nt::A,
+            'C' => Nt::C,
+            'G' => Nt::G,
+            'T' | 'U' => Nt::T,
+            _ => Nt::Invalid,
+        }
+    }
 }
 
 /// Maps an ASCII chars to index