Skip to content

Commit 59bc44e

Browse files
committed
head: fix bug with non-terminated files.
Fixes issue #7472. Update to head app when printing all-but-last-n-lines of a file. Code now checks if the last line of the input file is missing a terminating newline character, and if so prints an extra line. This aligns with GNU-head behavior. Also improved performance of this usecase by using an optimized iterator (memchr-iter) for searching through the input file.
1 parent 2e3da88 commit 59bc44e

File tree

2 files changed

+89
-12
lines changed

2 files changed

+89
-12
lines changed

src/uu/head/src/head.rs

+53-11
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
55

6-
// spell-checker:ignore (vars) seekable
6+
// spell-checker:ignore (vars) seekable memrchr
77

88
use clap::{Arg, ArgAction, ArgMatches, Command};
9+
use memchr::memrchr_iter;
910
use std::ffi::OsString;
1011
#[cfg(unix)]
1112
use std::fs::File;
@@ -378,30 +379,52 @@ where
378379

379380
let mut buffer = [0u8; BUF_SIZE];
380381

381-
let mut i = 0u64;
382382
let mut lines = 0u64;
383+
let mut check_last_byte_first_loop = true;
384+
let mut bytes_remaining_to_search = file_size;
383385

384386
loop {
385387
// the casts here are ok, `buffer.len()` should never be above a few k
386-
let bytes_remaining_to_search = file_size - i;
387-
let bytes_to_read_this_loop = bytes_remaining_to_search.min(BUF_SIZE.try_into().unwrap());
388+
let bytes_to_read_this_loop =
389+
bytes_remaining_to_search.min(buffer.len().try_into().unwrap());
388390
let read_start_offset = bytes_remaining_to_search - bytes_to_read_this_loop;
389391
let buffer = &mut buffer[..bytes_to_read_this_loop.try_into().unwrap()];
392+
bytes_remaining_to_search -= bytes_to_read_this_loop;
390393

391394
input.seek(SeekFrom::Start(read_start_offset))?;
392395
input.read_exact(buffer)?;
393-
for byte in buffer.iter().rev() {
394-
if byte == &separator {
395-
lines += 1;
396-
}
396+
397+
// Unfortunately need special handling for the case that the input file doesn't have
398+
// a terminating `separator` character.
399+
// If the input file doesn't end with a `separator` character, add an extra line to our
400+
// `line` counter. In the case that `n` is 0 we need to return here since we've
401+
// obviously found our 0th-line-from-the-end offset.
402+
if check_last_byte_first_loop {
403+
check_last_byte_first_loop = false;
404+
match buffer.last() {
405+
Some(last_byte_of_file) if (last_byte_of_file != &separator) => {
406+
if n == 0 {
407+
input.rewind()?;
408+
return Ok(file_size);
409+
}
410+
assert_eq!(lines, 0);
411+
lines = 1;
412+
}
413+
_ => (),
414+
};
415+
}
416+
417+
for separator_offset in memrchr_iter(separator, &buffer[..]) {
418+
lines += 1;
397419
// if it were just `n`,
398420
if lines == n + 1 {
399421
input.rewind()?;
400-
return Ok(file_size - i);
422+
return Ok(read_start_offset
423+
+ TryInto::<u64>::try_into(separator_offset).unwrap()
424+
+ 1);
401425
}
402-
i += 1;
403426
}
404-
if file_size - i == 0 {
427+
if read_start_offset == 0 {
405428
input.rewind()?;
406429
return Ok(0);
407430
}
@@ -753,4 +776,23 @@ mod tests {
753776
0
754777
);
755778
}
779+
780+
#[test]
781+
fn test_find_nth_line_from_end_non_terminated() {
782+
// Validate the find_nth_line_from_end for files that are not terminated with a final
783+
// newline character.
784+
let input_file = "a\nb";
785+
let mut input = Cursor::new(input_file);
786+
assert_eq!(find_nth_line_from_end(&mut input, 0, b'\n').unwrap(), 3);
787+
assert_eq!(find_nth_line_from_end(&mut input, 1, b'\n').unwrap(), 2);
788+
}
789+
790+
#[test]
791+
fn test_find_nth_line_from_end_empty() {
792+
// Validate the find_nth_line_from_end for files that are empty.
793+
let input_file = "";
794+
let mut input = Cursor::new(input_file);
795+
assert_eq!(find_nth_line_from_end(&mut input, 0, b'\n').unwrap(), 0);
796+
assert_eq!(find_nth_line_from_end(&mut input, 1, b'\n').unwrap(), 0);
797+
}
756798
}

tests/by-util/test_head.rs

+36-1
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,15 @@ fn test_zero_terminated_syntax_2() {
148148
.stdout_is("x\0y");
149149
}
150150

151+
#[test]
152+
fn test_non_terminated_input() {
153+
new_ucmd!()
154+
.args(&["-n", "-1"])
155+
.pipe_in("x\ny")
156+
.succeeds()
157+
.stdout_is("x\n");
158+
}
159+
151160
#[test]
152161
fn test_zero_terminated_negative_lines() {
153162
new_ucmd!()
@@ -448,12 +457,19 @@ fn test_all_but_last_lines_large_file() {
448457
let scene = TestScenario::new(util_name!());
449458
let fixtures = &scene.fixtures;
450459
let seq_20000_file_name = "seq_20000";
460+
let seq_20000_truncated_file_name = "seq_20000_truncated";
451461
let seq_1000_file_name = "seq_1000";
452462
scene
453463
.cmd("seq")
454464
.arg("20000")
455465
.set_stdout(fixtures.make_file(seq_20000_file_name))
456466
.succeeds();
467+
// Create a file the same as seq_20000 except for the final terminating endline.
468+
scene
469+
.ucmd()
470+
.args(&["-c", "-1", seq_20000_file_name])
471+
.set_stdout(fixtures.make_file(seq_20000_truncated_file_name))
472+
.succeeds();
457473
scene
458474
.cmd("seq")
459475
.arg("1000")
@@ -465,7 +481,7 @@ fn test_all_but_last_lines_large_file() {
465481
.ucmd()
466482
.args(&["-n", "-19000", seq_20000_file_name])
467483
.succeeds()
468-
.stdout_only_fixture("seq_1000");
484+
.stdout_only_fixture(seq_1000_file_name);
469485

470486
scene
471487
.ucmd()
@@ -478,6 +494,25 @@ fn test_all_but_last_lines_large_file() {
478494
.args(&["-n", "-20001", seq_20000_file_name])
479495
.succeeds()
480496
.stdout_only_fixture("emptyfile.txt");
497+
498+
// Confirm correct behavior when the input file doesn't end with a newline.
499+
scene
500+
.ucmd()
501+
.args(&["-n", "-19000", seq_20000_truncated_file_name])
502+
.succeeds()
503+
.stdout_only_fixture(seq_1000_file_name);
504+
505+
scene
506+
.ucmd()
507+
.args(&["-n", "-20000", seq_20000_truncated_file_name])
508+
.succeeds()
509+
.stdout_only_fixture("emptyfile.txt");
510+
511+
scene
512+
.ucmd()
513+
.args(&["-n", "-20001", seq_20000_truncated_file_name])
514+
.succeeds()
515+
.stdout_only_fixture("emptyfile.txt");
481516
}
482517

483518
#[cfg(all(

0 commit comments

Comments
 (0)