
Commit

Improve docs
wang-q committed Feb 19, 2025
1 parent ad2ce01 commit 9aa635c
Showing 6 changed files with 144 additions and 112 deletions.
43 changes: 28 additions & 15 deletions src/cmd/dedup.rs
@@ -7,10 +7,25 @@ pub fn make_subcommand() -> Command {
.about("Deduplicate records in FA file(s)")
.after_help(
r###"
This command removes duplicate records from one or more FA files based on name, description, or sequence.
This command removes duplicate records from FA files.
* The default behavior is the same as `hnsm filter -u`
* By default, only the forward strand is compared, setting `-b` compares both strands
Deduplication modes:
* By name (default): Compare sequence names only
* By description (-d): Compare full headers (name + description)
* By sequence (-s): Compare sequence contents
Comparison options:
* -b: Compare both strands (forward and reverse complement)
* -c: Case-insensitive comparison
Output options:
* -f FILE: Save the duplicate-name mapping to FILE
* Format: original_name duplicate_name
Notes:
* The first occurrence is kept; later duplicates are removed
* Supports both plain text and gzipped (.gz) files
* -b implies case-insensitive comparison for sequences
sequence name
| |
@@ -19,20 +34,17 @@ This command removes duplicate records from one or more FA files based on name,
description
Examples:
1. Deduplicate by name (default):
hnsm dedup input.fa
1. Basic deduplication by name:
hnsm dedup input.fa -o output.fa
2. Deduplicate by sequence:
hnsm dedup input.fa -s
2. By sequence content:
hnsm dedup input.fa -s -o output.fa
3. Deduplicate by name and description:
hnsm dedup input.fa -d
3. Compare both strands:
hnsm dedup input.fa -s -b -o output.fa
4. Compare both strands:
hnsm dedup input.fa -b
5. Save duplicated names to a file:
hnsm dedup input.fa -f duplicates.txt
4. Save duplicates mapping:
hnsm dedup input.fa -f dups.tsv -o output.fa
"###,
)
@@ -104,9 +116,10 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.build_from_writer(writer);

//----------------------------
// Ops
// Process
//----------------------------
let mut subject_map: HashMap<u64, Vec<String>> = HashMap::new();

for infile in args.get_many::<String>("infiles").unwrap() {
let reader = intspan::reader(infile);
let mut fa_in = noodles_fasta::io::Reader::new(reader);
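For reference, a minimal sketch of the hashing idea dedup.rs appears to build on (its `subject_map: HashMap<u64, Vec<String>>` above): hash a canonical key per record, keep the first record seen for each hash, and emit an original/duplicate pair for later hits. The `seq_key` helper, the `seen` map, and the inline records are illustrative assumptions, not the crate's actual API.

    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashMap;
    use std::hash::{Hash, Hasher};

    // Hypothetical helper: hash a sequence key, optionally case-insensitive and
    // canonicalized over both strands (the smaller of forward / reverse complement).
    fn seq_key(seq: &str, both_strands: bool, ignore_case: bool) -> u64 {
        let fwd = if ignore_case { seq.to_ascii_uppercase() } else { seq.to_string() };
        let mut hasher = DefaultHasher::new();
        if both_strands {
            let rc: String = fwd
                .chars()
                .rev()
                .map(|c| match c {
                    'A' => 'T',
                    'T' => 'A',
                    'C' => 'G',
                    'G' => 'C',
                    other => other,
                })
                .collect();
            std::cmp::min(&fwd, &rc).hash(&mut hasher);
        } else {
            fwd.hash(&mut hasher);
        }
        hasher.finish()
    }

    fn main() {
        // (name, sequence) pairs standing in for parsed FASTA records.
        let records = [("s1", "ACGT"), ("s2", "acgt"), ("s3", "TTTT")];
        let mut seen: HashMap<u64, String> = HashMap::new();
        for (name, seq) in records {
            let key = seq_key(seq, true, true);
            match seen.get(&key) {
                // Duplicate: emit an original_name <TAB> duplicate_name mapping line.
                Some(first) => println!("{}\t{}", first, name),
                // First occurrence: keep the record.
                None => {
                    seen.insert(key, name.to_string());
                    println!(">{}\n{}", name, seq);
                }
            }
        }
    }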
59 changes: 27 additions & 32 deletions src/cmd/filter.rs
@@ -4,45 +4,40 @@ use std::collections::BTreeSet;
// Create clap subcommand arguments
pub fn make_subcommand() -> Command {
Command::new("filter")
.about("Filter records in FA file(s)")
.about("Filter and format sequences in FA file(s)")
.after_help(
r###"
This command filters records in one or more FASTA files based on various criteria.
It can filter by sequence length, number of Ns, and more. It also supports formatting options.
This command filters and formats sequences in FA files.
Filters:
* --minsize N: Keep sequences >= N bp
* --maxsize N: Keep sequences <= N bp
* --maxn N: Keep sequences with < N ambiguous bases
* --uniq: Remove duplicate sequence IDs
Formatters:
* --upper: Convert sequences to uppercase
* --iupac: Convert ambiguous codes to 'N'
* --dash: Remove dashes from sequences
* --simplify: Simplify sequence names (truncate at first space/./,/-)
* --line N: Set sequence line length
Notes:
* Multiple filters can be combined
* Supports both plain text and gzipped (.gz) files
* For duplicate IDs, keeps the first occurrence
* Not all faFilter options have been implemented
Wildcards for names can be easily implemented with `hnsm some`
* This subcommand is also a formatter
* -l is used to set the number of bases per line
* -b/--block is not implemented here
Examples:
1. Filter sequences by minimum size:
hnsm filter input.fa --minsize 100
2. Filter sequences by maximum size:
hnsm filter input.fa --maxsize 1000
3. Filter sequences by maximum number of Ns:
hnsm filter input.fa --maxn 10
4. Remove duplicate sequences:
hnsm filter input.fa --uniq
5. Convert sequences to upper case:
hnsm filter input.fa --upper
6. Convert IUPAC ambiguous codes to 'N':
hnsm filter input.fa --iupac
7. Remove dashes from sequences:
hnsm filter input.fa --dash
1. Filter by size:
hnsm filter input.fa --minsize 100 --maxsize 1000
8. Simplify sequence names:
hnsm filter input.fa --simplify
2. Format sequences:
hnsm filter input.fa --upper --iupac --line 80
9. Set sequence line length:
hnsm filter input.fa --line 80
3. Process multiple files:
hnsm filter *.fa --uniq --simplify -o output.fa
"###,
)
@@ -158,7 +153,7 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.build_from_writer(writer);

//----------------------------
// Ops
// Process
//----------------------------
let mut set_list: BTreeSet<String> = BTreeSet::new();
for infile in args.get_many::<String>("infiles").unwrap() {
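As a rough illustration of how the --minsize/--maxsize/--maxn filters documented above can combine, here is a self-contained sketch; the `passes` helper and its signature are assumptions for illustration, not this file's real function.

    // Hypothetical helper: true if a sequence survives --minsize/--maxsize/--maxn.
    fn passes(seq: &str, minsize: Option<usize>, maxsize: Option<usize>, maxn: Option<usize>) -> bool {
        let len = seq.len();
        if minsize.map_or(false, |min| len < min) {
            return false; // shorter than --minsize
        }
        if maxsize.map_or(false, |max| len > max) {
            return false; // longer than --maxsize
        }
        if let Some(limit) = maxn {
            // Count bases that are not plain A/C/G/T (Ns and other ambiguity codes).
            let ambiguous = seq.chars().filter(|c| !"ACGTacgt".contains(*c)).count();
            if ambiguous >= limit {
                return false; // at or above the --maxn limit
            }
        }
        true
    }

    fn main() {
        assert!(passes("ACGTACGT", Some(5), Some(10), Some(1))); // 8 bp, no Ns
        assert!(!passes("ACGT", Some(5), None, None));           // below --minsize
        assert!(!passes("ACGTNNNN", None, None, Some(2)));       // 4 ambiguous >= 2
    }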
52 changes: 30 additions & 22 deletions src/cmd/mask.rs
@@ -3,27 +3,38 @@ use clap::*;
// Create clap subcommand arguments
pub fn make_subcommand() -> Command {
Command::new("mask")
.about("Soft/hard-masking regions in FA file(s)")
.about("Mask regions in FA file(s)")
.after_help(
r###"
This command masks regions in a FA file based on a runlist JSON file. The runlist specifies
regions to be masked, and the masking can be either soft (lowercase) or hard (replace with N).
This command masks specified regions in FASTA sequences.
The runlist JSON file should have the following format:
{
"seq_name": "start1-end1,start2-end2,...",
...
}
Masking modes:
* Soft-masking (default): Convert to lowercase
* Hard-masking (--hard): Replace with N's
Input format (runlist.json):
{
"seq1": "1-100,200-300", # Mask positions 1-100 and 200-300
"seq2": "50-150", # Mask positions 50-150
"seq3": "1-50,90-100,..." # Multiple regions allowed
}
Notes:
* 1-based coordinates
* Inclusive ranges
* Sequences not in runlist remain unchanged
* Supports both plain text and gzipped (.gz) files
* Invalid ranges are silently ignored
Examples:
1. Soft-mask regions specified in runlist.json:
hnsm mask input.fa runlist.json -o masked.fa
1. Soft-mask regions:
hnsm mask input.fa regions.json -o output.fa
2. Hard-mask regions (replace with N):
hnsm mask input.fa runlist.json --hard -o masked.fa
2. Hard-mask regions:
hnsm mask input.fa regions.json --hard -o output.fa
3. Output to stdout:
hnsm mask input.fa runlist.json
3. Process gzipped files:
hnsm mask input.fa.gz regions.json -o output.fa.gz
"###,
)
@@ -76,12 +87,10 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.build_from_writer(writer);

//----------------------------
// Ops
// Process
//----------------------------
for result in fa_in.records() {
// obtain record or fail with error
let record = result?;

let name = String::from_utf8(record.name().into())?;
let seq = record.sequence();

Expand All @@ -98,12 +107,11 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
let offset = (lower - 1) as usize;
let length = (upper - lower + 1) as usize;

let mut str = seq_out[offset..offset + length].to_string();
if is_hard {
str = "N".repeat(length); // Hard-mask with N
let str = if is_hard {
"N".repeat(length)
} else {
str = str.to_lowercase(); // Soft-mask with lowercase
}
seq_out[offset..offset + length].to_lowercase()
};
seq_out.replace_range(offset..offset + length, &str);
}

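The notes above say the runlist uses 1-based, inclusive coordinates; in Rust string terms that is a 0-based, half-open range, which is what the refactored block computes with `offset` and `length`. A standalone sketch of that conversion plus the soft/hard choice follows; `parse_ranges` and `mask_region` are hypothetical helpers for illustration, not this file's actual functions.

    // Hypothetical helper: split a runlist value like "1-100,200-300" into (start, end)
    // pairs; malformed parts are dropped, matching "invalid ranges are silently ignored".
    fn parse_ranges(spec: &str) -> Vec<(usize, usize)> {
        spec.split(',')
            .filter_map(|part| {
                let (lo, hi) = part.split_once('-')?;
                Some((lo.trim().parse().ok()?, hi.trim().parse().ok()?))
            })
            .collect()
    }

    // Hypothetical helper: mask one 1-based inclusive region, soft (lowercase) or hard (N).
    fn mask_region(seq: &mut String, lower: usize, upper: usize, hard: bool) {
        let offset = lower - 1;         // 1-based -> 0-based
        let length = upper - lower + 1; // inclusive -> length
        let masked = if hard {
            "N".repeat(length)
        } else {
            seq[offset..offset + length].to_lowercase()
        };
        seq.replace_range(offset..offset + length, &masked);
    }

    fn main() {
        let mut seq = String::from("ACGTACGTAC");
        for (lo, hi) in parse_ranges("3-6") {
            mask_region(&mut seq, lo, hi, false); // soft-mask positions 3-6
        }
        assert_eq!(seq, "ACgtacGTAC");
        mask_region(&mut seq, 1, 2, true);        // hard-mask positions 1-2
        assert_eq!(seq, "NNgtacGTAC");
    }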
44 changes: 26 additions & 18 deletions src/cmd/rc.rs
@@ -4,23 +4,31 @@ use std::collections::HashSet;
// Create clap subcommand arguments
pub fn make_subcommand() -> Command {
Command::new("rc")
.about("Reverse complement a FA file")
.about("Reverse complement sequences in FA file(s)")
.after_help(
r###"
This command reverse complements sequences in a FA file. If a list of sequence names is provided,
only the sequences in the list will be reverse complemented. Otherwise, all sequences will be processed.
This command reverse complements DNA sequences in FA files.
By default, reverse complemented sequences will have their names prefixed with "RC_". Use the --consistent
flag to keep the original names.
Features:
* Process all sequences or only selected ones
* Optionally prefix names with 'RC_'
* Handles IUPAC ambiguous codes correctly
* Preserves case (upper/lower) of bases
Notes:
* Case-sensitive name matching when using list
* Empty lines and lines starting with '#' are ignored in list
* Supports both plain text and gzipped (.gz) files
* Non-IUPAC characters are preserved as-is
Examples:
1. Reverse complement all sequences in a FASTA file:
1. Reverse complement all sequences:
hnsm rc input.fa -o output.fa
2. Reverse complement only sequences listed in list.txt:
2. Only process listed sequences:
hnsm rc input.fa list.txt -o output.fa
3. Reverse complement sequences but keep their original names:
3. Keep original names (no 'RC_' prefix):
hnsm rc input.fa -c -o output.fa
"###,
@@ -69,9 +77,6 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.set_line_base_count(usize::MAX)
.build_from_writer(writer);

//----------------------------
// Ops
//----------------------------
let set_list: HashSet<String> = if args.contains_id("list.txt") {
intspan::read_first_column(args.get_one::<String>("list.txt").unwrap())
.into_iter()
@@ -80,22 +85,25 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
HashSet::new()
};

//----------------------------
// Process
//----------------------------
for result in fa_in.records() {
// obtain record or fail with error
let record = result?;
let mut name = String::from_utf8(record.name().into()).unwrap();
let name = String::from_utf8(record.name().into())?;

if args.contains_id("list.txt") && !set_list.contains(&name) {
fa_out.write_record(&record)?;
continue;
}

if !is_consistent {
name = format!("RC_{}", name);
}

let definition = noodles_fasta::record::Definition::new(&*name, None);
let new_name = if is_consistent {
name
} else {
format!("RC_{}", name)
};

let definition = noodles_fasta::record::Definition::new(&*new_name, None);
let seq_rc: noodles_fasta::record::Sequence = record
.sequence()
.complement()
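In the diff above the actual reverse complement comes from noodles (`record.sequence().complement()`); the sketch below only illustrates the behavior the notes promise, namely IUPAC-aware complementation with case preserved and non-IUPAC characters passed through. The `complement` and `rev_comp` helpers are illustrative assumptions.

    // Hypothetical helper: complement one base, IUPAC-aware, preserving case and
    // passing non-IUPAC characters through unchanged.
    fn complement(c: char) -> char {
        let comp = match c.to_ascii_uppercase() {
            'A' => 'T', 'T' => 'A', 'U' => 'A', 'C' => 'G', 'G' => 'C',
            'R' => 'Y', 'Y' => 'R', 'S' => 'S', 'W' => 'W', 'K' => 'M', 'M' => 'K',
            'B' => 'V', 'V' => 'B', 'D' => 'H', 'H' => 'D', 'N' => 'N',
            other => other,
        };
        if c.is_ascii_lowercase() {
            comp.to_ascii_lowercase()
        } else {
            comp
        }
    }

    // Reverse the sequence and complement each base.
    fn rev_comp(seq: &str) -> String {
        seq.chars().rev().map(complement).collect()
    }

    fn main() {
        assert_eq!(rev_comp("ACGTn"), "nACGT");
        // Mirrors the default naming behavior: prefix the record name with "RC_".
        println!(">RC_seq1\n{}", rev_comp("ACGTn"));
    }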
19 changes: 7 additions & 12 deletions src/cmd/replace.rs
@@ -13,9 +13,9 @@ If more than two columns are provided, the sequence will be duplicated for each
Multiple lines with the same original_name will also duplicate the record.
The TSV file format:
original_name replace_name more_replace_name
original_name replace_name
original_name another_replace_name
seq1 replace_name more_replace_name
seq2 replace_name
seq2 another_replace_name
Examples:
1. Replace headers using a TSV file:
@@ -72,25 +72,20 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
.build_from_writer(writer);

//----------------------------
// Ops
// Process
//----------------------------
for result in fa_in.records() {
// obtain record or fail with error
let record = result?;
let name = String::from_utf8(record.name().into())?;

if replace_of.contains_key(&name) {
for el in replace_of.get(&name).unwrap() {
if let Some(new_names) = replace_of.get(&name) {
for el in new_names {
let definition = noodles_fasta::record::Definition::new(&**el, None);
let record_replace =
noodles_fasta::Record::new(definition, record.sequence().clone());
// output the replaced record
fa_out.write_record(&record_replace)?;
}
} else if is_some {
continue;
} else {
// output the original record
} else if !is_some {
fa_out.write_record(&record)?;
}
}
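A sketch of the lookup replace.rs consults (`replace_of` in the diff above), assuming it is a name-to-new-names map built from the TSV, where extra columns and repeated rows both fan out into duplicated records; the inline TSV string and the `main` wrapper are illustrative only.

    use std::collections::HashMap;

    fn main() {
        // Stand-in for the TSV contents: seq1 gets two new names via extra columns,
        // seq2 gets two via repeated rows.
        let tsv = "seq1\tnew1\tnew1b\nseq2\tnew2\nseq2\tnew2b\n";

        let mut replace_of: HashMap<String, Vec<String>> = HashMap::new();
        for line in tsv.lines() {
            let fields: Vec<&str> = line.split('\t').collect();
            if fields.len() < 2 {
                continue; // names without a replacement are ignored in this sketch
            }
            replace_of
                .entry(fields[0].to_string())
                .or_default()
                .extend(fields[1..].iter().map(|s| s.to_string()));
        }

        assert_eq!(replace_of["seq1"], vec!["new1", "new1b"]);
        assert_eq!(replace_of["seq2"], vec!["new2", "new2b"]);
        // Writing the FASTA output then follows the loop shown above: one record per
        // name in replace_of[name], or the original record if the name is absent.
    }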
