
Commit

Improve docs
wang-q committed Feb 19, 2025
1 parent ad2ce01 commit 9aa635c
Showing 6 changed files with 144 additions and 112 deletions.
43 changes: 28 additions & 15 deletions src/cmd/dedup.rs
@@ -7,10 +7,25 @@ pub fn make_subcommand() -> Command {
.about("Deduplicate records in FA file(s)")
.after_help(
r###"
This command removes duplicate records from one or more FA files based on name, description, or sequence.
This command removes duplicate records from FA files.
* The default behavior is the same as `hnsm filter -u`
* By default, only the forward strand is compared, setting `-b` compares both strands
Deduplication modes:
* By name (default): Compare sequence names only
* By description (-d): Compare full headers (name + description)
* By sequence (-s): Compare sequence contents
Comparison options:
* -b: Compare both strands (forward and reverse complement)
* -c: Case-insensitive comparison
Output options:
* -f FILE: Save the duplicate-name mapping to FILE
* Format: original_name duplicate_name
Notes:
* The first occurrence is kept; later duplicates are removed
* Supports both plain text and gzipped (.gz) files
* -b implies case-insensitive comparison for sequences
sequence name
| |
@@ -19,20 +34,17 @@ This command removes duplicate records from one or more FA files based on name,
description
Examples:
1. Deduplicate by name (default):
hnsm dedup input.fa
1. Basic deduplication by name:
hnsm dedup input.fa -o output.fa
2. Deduplicate by sequence:
hnsm dedup input.fa -s
2. By sequence content:
hnsm dedup input.fa -s -o output.fa
3. Deduplicate by name and description:
hnsm dedup input.fa -d
3. Compare both strands:
hnsm dedup input.fa -s -b -o output.fa
4. Compare both strands:
hnsm dedup input.fa -b
5. Save duplicated names to a file:
hnsm dedup input.fa -f duplicates.txt
4. Save duplicates mapping:
hnsm dedup input.fa -f dups.tsv -o output.fa
"###,
)
@@ -104,9 +116,10 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.build_from_writer(writer);

//----------------------------
// Ops
// Process
//----------------------------
let mut subject_map: HashMap<u64, Vec<String>> = HashMap::new();

for infile in args.get_many::<String>("infiles").unwrap() {
let reader = intspan::reader(infile);
let mut fa_in = noodles_fasta::io::Reader::new(reader);
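For reference, a minimal sketch of the hashing idea dedup.rs appears to build on (its `subject_map: HashMap<u64, Vec<String>>` above): hash a canonical key per record, keep the first record seen for each hash, and emit an original/duplicate pair for later hits. The `seq_key` helper, the `seen` map, and the inline records are illustrative assumptions, not the crate's actual API.

    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashMap;
    use std::hash::{Hash, Hasher};

    // Hypothetical helper: hash a sequence key, optionally case-insensitive and
    // canonicalized over both strands (the smaller of forward / reverse complement).
    fn seq_key(seq: &str, both_strands: bool, ignore_case: bool) -> u64 {
        let fwd = if ignore_case { seq.to_ascii_uppercase() } else { seq.to_string() };
        let mut hasher = DefaultHasher::new();
        if both_strands {
            let rc: String = fwd
                .chars()
                .rev()
                .map(|c| match c {
                    'A' => 'T',
                    'T' => 'A',
                    'C' => 'G',
                    'G' => 'C',
                    other => other,
                })
                .collect();
            std::cmp::min(&fwd, &rc).hash(&mut hasher);
        } else {
            fwd.hash(&mut hasher);
        }
        hasher.finish()
    }

    fn main() {
        // (name, sequence) pairs standing in for parsed FASTA records.
        let records = [("s1", "ACGT"), ("s2", "acgt"), ("s3", "TTTT")];
        let mut seen: HashMap<u64, String> = HashMap::new();
        for (name, seq) in records {
            let key = seq_key(seq, true, true);
            match seen.get(&key) {
                // Duplicate: emit an original_name <TAB> duplicate_name mapping line.
                Some(first) => println!("{}\t{}", first, name),
                // First occurrence: keep the record.
                None => {
                    seen.insert(key, name.to_string());
                    println!(">{}\n{}", name, seq);
                }
            }
        }
    }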
59 changes: 27 additions & 32 deletions src/cmd/filter.rs
@@ -4,45 +4,40 @@ use std::collections::BTreeSet;
// Create clap subcommand arguments
pub fn make_subcommand() -> Command {
Command::new("filter")
.about("Filter records in FA file(s)")
.about("Filter and format sequences in FA file(s)")
.after_help(
r###"
This command filters records in one or more FASTA files based on various criteria.
It can filter by sequence length, number of Ns, and more. It also supports formatting options.
This command filters and formats sequences in FA files.
Filters:
* --minsize N: Keep sequences >= N bp
* --maxsize N: Keep sequences <= N bp
* --maxn N: Keep sequences with < N ambiguous bases
* --uniq: Remove duplicate sequence IDs
Formatters:
* --upper: Convert sequences to uppercase
* --iupac: Convert ambiguous codes to 'N'
* --dash: Remove dashes from sequences
* --simplify: Simplify sequence names (truncate at first space/./,/-)
* --line N: Set sequence line length
Notes:
* Multiple filters can be combined
* Supports both plain text and gzipped (.gz) files
* For duplicate IDs, keeps the first occurrence
* Not all faFilter options have been implemented
Wildcards for names can be easily implemented with `hnsm some`
* This subcommand is also a formatter
* -l is used to set the number of bases per line
* -b/--block is not implemented here
Examples:
1. Filter sequences by minimum size:
hnsm filter input.fa --minsize 100
2. Filter sequences by maximum size:
hnsm filter input.fa --maxsize 1000
3. Filter sequences by maximum number of Ns:
hnsm filter input.fa --maxn 10
4. Remove duplicate sequences:
hnsm filter input.fa --uniq
5. Convert sequences to upper case:
hnsm filter input.fa --upper
6. Convert IUPAC ambiguous codes to 'N':
hnsm filter input.fa --iupac
7. Remove dashes from sequences:
hnsm filter input.fa --dash
1. Filter by size:
hnsm filter input.fa --minsize 100 --maxsize 1000
8. Simplify sequence names:
hnsm filter input.fa --simplify
2. Format sequences:
hnsm filter input.fa --upper --iupac --line 80
9. Set sequence line length:
hnsm filter input.fa --line 80
3. Process multiple files:
hnsm filter *.fa --uniq --simplify -o output.fa
"###,
)
@@ -158,7 +153,7 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.build_from_writer(writer);

//----------------------------
// Ops
// Process
//----------------------------
let mut set_list: BTreeSet<String> = BTreeSet::new();
for infile in args.get_many::<String>("infiles").unwrap() {
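As a rough illustration of how the --minsize/--maxsize/--maxn filters documented above can combine, here is a self-contained sketch; the `passes` helper and its signature are assumptions for illustration, not this file's real function.

    // Hypothetical helper: true if a sequence survives --minsize/--maxsize/--maxn.
    fn passes(seq: &str, minsize: Option<usize>, maxsize: Option<usize>, maxn: Option<usize>) -> bool {
        let len = seq.len();
        if minsize.map_or(false, |min| len < min) {
            return false; // shorter than --minsize
        }
        if maxsize.map_or(false, |max| len > max) {
            return false; // longer than --maxsize
        }
        if let Some(limit) = maxn {
            // Count bases that are not plain A/C/G/T (Ns and other ambiguity codes).
            let ambiguous = seq.chars().filter(|c| !"ACGTacgt".contains(*c)).count();
            if ambiguous >= limit {
                return false; // at or above the --maxn limit
            }
        }
        true
    }

    fn main() {
        assert!(passes("ACGTACGT", Some(5), Some(10), Some(1))); // 8 bp, no Ns
        assert!(!passes("ACGT", Some(5), None, None));           // below --minsize
        assert!(!passes("ACGTNNNN", None, None, Some(2)));       // 4 ambiguous >= 2
    }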
52 changes: 30 additions & 22 deletions src/cmd/mask.rs
@@ -3,27 +3,38 @@ use clap::*;
// Create clap subcommand arguments
pub fn make_subcommand() -> Command {
Command::new("mask")
.about("Soft/hard-masking regions in FA file(s)")
.about("Mask regions in FA file(s)")
.after_help(
r###"
This command masks regions in a FA file based on a runlist JSON file. The runlist specifies
regions to be masked, and the masking can be either soft (lowercase) or hard (replace with N).
This command masks specified regions in FASTA sequences.
The runlist JSON file should have the following format:
{
"seq_name": "start1-end1,start2-end2,...",
...
}
Masking modes:
* Soft-masking (default): Convert to lowercase
* Hard-masking (--hard): Replace with N's
Input format (runlist.json):
{
"seq1": "1-100,200-300", # Mask positions 1-100 and 200-300
"seq2": "50-150", # Mask positions 50-150
"seq3": "1-50,90-100,..." # Multiple regions allowed
}
Notes:
* 1-based coordinates
* Inclusive ranges
* Sequences not in runlist remain unchanged
* Supports both plain text and gzipped (.gz) files
* Invalid ranges are silently ignored
Examples:
1. Soft-mask regions specified in runlist.json:
hnsm mask input.fa runlist.json -o masked.fa
1. Soft-mask regions:
hnsm mask input.fa regions.json -o output.fa
2. Hard-mask regions (replace with N):
hnsm mask input.fa runlist.json --hard -o masked.fa
2. Hard-mask regions:
hnsm mask input.fa regions.json --hard -o output.fa
3. Output to stdout:
hnsm mask input.fa runlist.json
3. Process gzipped files:
hnsm mask input.fa.gz regions.json -o output.fa.gz
"###,
)
@@ -76,12 +87,10 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.build_from_writer(writer);

//----------------------------
// Ops
// Process
//----------------------------
for result in fa_in.records() {
// obtain record or fail with error
let record = result?;

let name = String::from_utf8(record.name().into())?;
let seq = record.sequence();

Expand All @@ -98,12 +107,11 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
let offset = (lower - 1) as usize;
let length = (upper - lower + 1) as usize;

let mut str = seq_out[offset..offset + length].to_string();
if is_hard {
str = "N".repeat(length); // Hard-mask with N
let str = if is_hard {
"N".repeat(length)
} else {
str = str.to_lowercase(); // Soft-mask with lowercase
}
seq_out[offset..offset + length].to_lowercase()
};
seq_out.replace_range(offset..offset + length, &str);
}

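The notes above say the runlist uses 1-based, inclusive coordinates; in Rust string terms that is a 0-based, half-open range, which is what the refactored block computes with `offset` and `length`. A standalone sketch of that conversion plus the soft/hard choice follows; `parse_ranges` and `mask_region` are hypothetical helpers for illustration, not this file's actual functions.

    // Hypothetical helper: split a runlist value like "1-100,200-300" into (start, end)
    // pairs; malformed parts are dropped, matching "invalid ranges are silently ignored".
    fn parse_ranges(spec: &str) -> Vec<(usize, usize)> {
        spec.split(',')
            .filter_map(|part| {
                let (lo, hi) = part.split_once('-')?;
                Some((lo.trim().parse().ok()?, hi.trim().parse().ok()?))
            })
            .collect()
    }

    // Hypothetical helper: mask one 1-based inclusive region, soft (lowercase) or hard (N).
    fn mask_region(seq: &mut String, lower: usize, upper: usize, hard: bool) {
        let offset = lower - 1;         // 1-based -> 0-based
        let length = upper - lower + 1; // inclusive -> length
        let masked = if hard {
            "N".repeat(length)
        } else {
            seq[offset..offset + length].to_lowercase()
        };
        seq.replace_range(offset..offset + length, &masked);
    }

    fn main() {
        let mut seq = String::from("ACGTACGTAC");
        for (lo, hi) in parse_ranges("3-6") {
            mask_region(&mut seq, lo, hi, false); // soft-mask positions 3-6
        }
        assert_eq!(seq, "ACgtacGTAC");
        mask_region(&mut seq, 1, 2, true);        // hard-mask positions 1-2
        assert_eq!(seq, "NNgtacGTAC");
    }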
44 changes: 26 additions & 18 deletions src/cmd/rc.rs
@@ -4,23 +4,31 @@ use std::collections::HashSet;
// Create clap subcommand arguments
pub fn make_subcommand() -> Command {
Command::new("rc")
.about("Reverse complement a FA file")
.about("Reverse complement sequences in FA file(s)")
.after_help(
r###"
This command reverse complements sequences in a FA file. If a list of sequence names is provided,
only the sequences in the list will be reverse complemented. Otherwise, all sequences will be processed.
This command reverse complements DNA sequences in FA files.
By default, reverse complemented sequences will have their names prefixed with "RC_". Use the --consistent
flag to keep the original names.
Features:
* Process all sequences or only selected ones
* Optionally prefix names with 'RC_'
* Handles IUPAC ambiguous codes correctly
* Preserves case (upper/lower) of bases
Notes:
* Case-sensitive name matching when using list
* Empty lines and lines starting with '#' are ignored in list
* Supports both plain text and gzipped (.gz) files
* Non-IUPAC characters are preserved as-is
Examples:
1. Reverse complement all sequences in a FASTA file:
1. Reverse complement all sequences:
hnsm rc input.fa -o output.fa
2. Reverse complement only sequences listed in list.txt:
2. Only process listed sequences:
hnsm rc input.fa list.txt -o output.fa
3. Reverse complement sequences but keep their original names:
3. Keep original names (no 'RC_' prefix):
hnsm rc input.fa -c -o output.fa
"###,
@@ -69,9 +77,6 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.set_line_base_count(usize::MAX)
.build_from_writer(writer);

//----------------------------
// Ops
//----------------------------
let set_list: HashSet<String> = if args.contains_id("list.txt") {
intspan::read_first_column(args.get_one::<String>("list.txt").unwrap())
.into_iter()
@@ -80,22 +85,25 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
HashSet::new()
};

//----------------------------
// Process
//----------------------------
for result in fa_in.records() {
// obtain record or fail with error
let record = result?;
let mut name = String::from_utf8(record.name().into()).unwrap();
let name = String::from_utf8(record.name().into())?;

if args.contains_id("list.txt") && !set_list.contains(&name) {
fa_out.write_record(&record)?;
continue;
}

if !is_consistent {
name = format!("RC_{}", name);
}

let definition = noodles_fasta::record::Definition::new(&*name, None);
let new_name = if is_consistent {
name
} else {
format!("RC_{}", name)
};

let definition = noodles_fasta::record::Definition::new(&*new_name, None);
let seq_rc: noodles_fasta::record::Sequence = record
.sequence()
.complement()
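In the diff above the actual reverse complement comes from noodles (`record.sequence().complement()`); the sketch below only illustrates the behavior the notes promise, namely IUPAC-aware complementation with case preserved and non-IUPAC characters passed through. The `complement` and `rev_comp` helpers are illustrative assumptions.

    // Hypothetical helper: complement one base, IUPAC-aware, preserving case and
    // passing non-IUPAC characters through unchanged.
    fn complement(c: char) -> char {
        let comp = match c.to_ascii_uppercase() {
            'A' => 'T', 'T' => 'A', 'U' => 'A', 'C' => 'G', 'G' => 'C',
            'R' => 'Y', 'Y' => 'R', 'S' => 'S', 'W' => 'W', 'K' => 'M', 'M' => 'K',
            'B' => 'V', 'V' => 'B', 'D' => 'H', 'H' => 'D', 'N' => 'N',
            other => other,
        };
        if c.is_ascii_lowercase() {
            comp.to_ascii_lowercase()
        } else {
            comp
        }
    }

    // Reverse the sequence and complement each base.
    fn rev_comp(seq: &str) -> String {
        seq.chars().rev().map(complement).collect()
    }

    fn main() {
        assert_eq!(rev_comp("ACGTn"), "nACGT");
        // Mirrors the default naming behavior: prefix the record name with "RC_".
        println!(">RC_seq1\n{}", rev_comp("ACGTn"));
    }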
19 changes: 7 additions & 12 deletions src/cmd/replace.rs
@@ -13,9 +13,9 @@ If more than two columns are provided, the sequence will be duplicated for each
Multiple lines with the same original_name will also duplicate the record.
The TSV file format:
original_name replace_name more_replace_name
original_name replace_name
original_name another_replace_name
seq1 replace_name more_replace_name
seq2 replace_name
seq2 another_replace_name
Examples:
1. Replace headers using a TSV file:
@@ -72,25 +72,20 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.build_from_writer(writer);

//----------------------------
// Ops
// Process
//----------------------------
for result in fa_in.records() {
// obtain record or fail with error
let record = result?;
let name = String::from_utf8(record.name().into())?;

if replace_of.contains_key(&name) {
for el in replace_of.get(&name).unwrap() {
if let Some(new_names) = replace_of.get(&name) {
for el in new_names {
let definition = noodles_fasta::record::Definition::new(&**el, None);
let record_replace =
noodles_fasta::Record::new(definition, record.sequence().clone());
// output the replaced record
fa_out.write_record(&record_replace)?;
}
} else if is_some {
continue;
} else {
// output the original record
} else if !is_some {
fa_out.write_record(&record)?;
}
}
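A sketch of the lookup replace.rs consults (`replace_of` in the diff above), assuming it is a name-to-new-names map built from the TSV, where extra columns and repeated rows both fan out into duplicated records; the inline TSV string and the `main` wrapper are illustrative only.

    use std::collections::HashMap;

    fn main() {
        // Stand-in for the TSV contents: seq1 gets two new names via extra columns,
        // seq2 gets two via repeated rows.
        let tsv = "seq1\tnew1\tnew1b\nseq2\tnew2\nseq2\tnew2b\n";

        let mut replace_of: HashMap<String, Vec<String>> = HashMap::new();
        for line in tsv.lines() {
            let fields: Vec<&str> = line.split('\t').collect();
            if fields.len() < 2 {
                continue; // names without a replacement are ignored in this sketch
            }
            replace_of
                .entry(fields[0].to_string())
                .or_default()
                .extend(fields[1..].iter().map(|s| s.to_string()));
        }

        assert_eq!(replace_of["seq1"], vec!["new1", "new1b"]);
        assert_eq!(replace_of["seq2"], vec!["new2", "new2b"]);
        // Writing the FASTA output then follows the loop shown above: one record per
        // name in replace_of[name], or the original record if the name is absent.
    }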
