Skip to content

Commit 39706b1

Browse files
authored
fs: try doing a non-blocking read before punting to the threadpool (#3518)
1 parent cab4a59 commit 39706b1

File tree

3 files changed

+213
-5
lines changed

3 files changed

+213
-5
lines changed

tokio/Cargo.toml

+3-3
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ full = [
4242
"time",
4343
]
4444

45-
fs = []
45+
fs = ["libc"]
4646
io-util = ["memchr", "bytes"]
4747
# stdin, stdout, stderr
4848
io-std = []
@@ -103,11 +103,11 @@ parking_lot = { version = "0.11.0", optional = true }
103103
tracing = { version = "0.1.21", default-features = false, features = ["std"], optional = true } # Not in full
104104

105105
[target.'cfg(unix)'.dependencies]
106-
libc = { version = "0.2.42", optional = true }
106+
libc = { version = "0.2.87", optional = true }
107107
signal-hook-registry = { version = "1.1.1", optional = true }
108108

109109
[target.'cfg(unix)'.dev-dependencies]
110-
libc = { version = "0.2.42" }
110+
libc = { version = "0.2.87" }
111111
nix = { version = "0.19.0" }
112112

113113
[target.'cfg(windows)'.dependencies.winapi]

tokio/src/fs/file.rs

+189-2
Original file line numberDiff line numberDiff line change
@@ -491,14 +491,18 @@ impl AsyncRead for File {
491491
loop {
492492
match inner.state {
493493
Idle(ref mut buf_cell) => {
494-
let mut buf = buf_cell.take().unwrap();
494+
let buf = buf_cell.as_mut().unwrap();
495495

496496
if !buf.is_empty() {
497497
buf.copy_to(dst);
498-
*buf_cell = Some(buf);
499498
return Ready(Ok(()));
500499
}
501500

501+
if let Some(x) = try_nonblocking_read(me.std.as_ref(), dst) {
502+
return Ready(x);
503+
}
504+
505+
let mut buf = buf_cell.take().unwrap();
502506
buf.ensure_capacity_for(dst);
503507
let std = me.std.clone();
504508

@@ -756,3 +760,186 @@ impl Inner {
756760
}
757761
}
758762
}
763+
764+
#[cfg(all(target_os = "linux", not(test)))]
765+
pub(crate) fn try_nonblocking_read(
766+
file: &crate::fs::sys::File,
767+
dst: &mut ReadBuf<'_>,
768+
) -> Option<std::io::Result<()>> {
769+
use std::sync::atomic::{AtomicBool, Ordering};
770+
771+
static NONBLOCKING_READ_SUPPORTED: AtomicBool = AtomicBool::new(true);
772+
if !NONBLOCKING_READ_SUPPORTED.load(Ordering::Relaxed) {
773+
return None;
774+
}
775+
let out = preadv2::preadv2_safe(file, dst, -1, preadv2::RWF_NOWAIT);
776+
if let Err(err) = &out {
777+
match err.raw_os_error() {
778+
Some(libc::ENOSYS) => {
779+
NONBLOCKING_READ_SUPPORTED.store(false, Ordering::Relaxed);
780+
return None;
781+
}
782+
Some(libc::ENOTSUP) | Some(libc::EAGAIN) => return None,
783+
_ => {}
784+
}
785+
}
786+
Some(out)
787+
}
788+
789+
/// Fallback used on non-Linux targets and in test builds.
///
/// Never attempts a read: it always returns `None`, so the caller proceeds
/// with its regular read path.
#[cfg(any(not(target_os = "linux"), test))]
pub(crate) fn try_nonblocking_read(
    _file: &crate::fs::sys::File,
    _dst: &mut ReadBuf<'_>,
) -> Option<std::io::Result<()>> {
    None
}
796+
797+
#[cfg(target_os = "linux")]
798+
mod preadv2 {
799+
use libc::{c_int, c_long, c_void, iovec, off_t, ssize_t};
800+
use std::os::unix::prelude::AsRawFd;
801+
802+
use crate::io::ReadBuf;
803+
804+
pub(crate) fn preadv2_safe(
805+
file: &std::fs::File,
806+
dst: &mut ReadBuf<'_>,
807+
offset: off_t,
808+
flags: c_int,
809+
) -> std::io::Result<()> {
810+
unsafe {
811+
/* We have to defend against buffer overflows manually here. The slice API makes
812+
* this fairly straightforward. */
813+
let unfilled = dst.unfilled_mut();
814+
let mut iov = iovec {
815+
iov_base: unfilled.as_mut_ptr() as *mut c_void,
816+
iov_len: unfilled.len(),
817+
};
818+
/* We take a File object rather than an fd as reading from a sensitive fd may confuse
819+
* other unsafe code that assumes that only they have access to that fd. */
820+
let bytes_read = preadv2(
821+
file.as_raw_fd(),
822+
&mut iov as *mut iovec as *const iovec,
823+
1,
824+
offset,
825+
flags,
826+
);
827+
if bytes_read < 0 {
828+
Err(std::io::Error::last_os_error())
829+
} else {
830+
/* preadv2 returns the number of bytes read, e.g. the number of bytes that have
831+
* written into `unfilled`. So it's safe to assume that the data is now
832+
* initialised */
833+
dst.assume_init(bytes_read as usize);
834+
dst.advance(bytes_read as usize);
835+
Ok(())
836+
}
837+
}
838+
}
839+
840+
// Unit tests for `preadv2_safe`. The steps below share one open file and one
// `ReadBuf`, so they depend on each other: both the file cursor and the
// buffer's filled/initialised lengths carry over from step to step.
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_preadv2_safe() {
        use std::io::{Seek, Write};
        use std::mem::MaybeUninit;
        use tempfile::tempdir;

        // Create a small file containing MESSAGE, then reopen it for reading.
        let tmp = tempdir().unwrap();
        let filename = tmp.path().join("file");
        const MESSAGE: &[u8] = b"Hello this is a test";
        {
            let mut f = std::fs::File::create(&filename).unwrap();
            f.write_all(MESSAGE).unwrap();
        }
        let f = std::fs::File::open(&filename).unwrap();

        // 50-byte destination, larger than MESSAGE so the first reads are short.
        let mut buf = [MaybeUninit::<u8>::new(0); 50];
        let mut br = ReadBuf::uninit(&mut buf);

        // Basic use: an explicit offset of 0 reads the whole message.
        preadv2_safe(&f, &mut br, 0, 0).unwrap();
        assert_eq!(br.initialized().len(), MESSAGE.len());
        assert_eq!(br.filled(), MESSAGE);

        // Here we check that offset works, but also that appending to a non-empty buffer
        // behaves correctly with respect to initialisation.
        preadv2_safe(&f, &mut br, 5, 0).unwrap();
        assert_eq!(br.initialized().len(), MESSAGE.len() * 2 - 5);
        assert_eq!(br.filled(), b"Hello this is a test this is a test".as_ref());

        // An offset of -1 means use the current cursor. This has not been advanced by the
        // previous reads because we specified an offset there. Only 15 bytes of space are
        // left, so the read is truncated to fill the buffer exactly.
        preadv2_safe(&f, &mut br, -1, 0).unwrap();
        assert_eq!(br.remaining(), 0);
        assert_eq!(
            br.filled(),
            b"Hello this is a test this is a testHello this is a".as_ref()
        );

        // ...but the cursor should have been advanced by that read, so the next
        // cursor-relative read returns the rest of the message.
        br.clear();
        preadv2_safe(&f, &mut br, -1, 0).unwrap();
        assert_eq!(br.filled(), b" test");

        // This should be in cache, so RWF_NOWAIT should work, but it not being in cache
        // (EAGAIN) or not supported by the underlying filesystem (ENOTSUP) is fine too.
        br.clear();
        match preadv2_safe(&f, &mut br, 0, RWF_NOWAIT) {
            Ok(()) => assert_eq!(br.filled(), MESSAGE),
            Err(e) => assert!(matches!(
                e.raw_os_error(),
                Some(libc::ENOTSUP) | Some(libc::EAGAIN)
            )),
        }

        // Test handling large offsets (beyond 32 bits).
        {
            // I hope the underlying filesystem supports sparse files
            let mut w = std::fs::OpenOptions::new()
                .write(true)
                .open(&filename)
                .unwrap();
            w.set_len(0x1_0000_0000).unwrap();
            w.seek(std::io::SeekFrom::Start(0x1_0000_0000)).unwrap();
            w.write_all(b"This is a Large File").unwrap();
        }

        // Reading 8 bytes past the 4 GiB mark skips "This is " and yields the rest.
        br.clear();
        preadv2_safe(&f, &mut br, 0x1_0000_0008, 0).unwrap();
        assert_eq!(br.filled(), b"a Large File");
    }
}
915+
916+
/// Splits a file offset into the `(low, high)` pair of `c_long` arguments that
/// the raw `preadv2` syscall expects.
///
/// On 32-bit architectures the 64-bit offset is spread over the two values.
/// 64-bit architectures still take both arguments, but only the low one is
/// inspected (see `pos_from_hilo` in linux/fs/read_write.c).
fn pos_to_lohi(offset: off_t) -> (c_long, c_long) {
    const LONG_BITS: usize = core::mem::size_of::<c_long>() * 8;
    const HALF_SHIFT: usize = LONG_BITS / 2;

    // The low word is a plain truncating cast.
    let lo = offset as c_long;

    // The high word is `offset` shifted right by the full width of `c_long`.
    // When `off_t` and `c_long` are the same width, a single shift by that
    // amount would overflow the shift, so it is done as two half-width shifts.
    let hi = {
        let half_shifted = offset >> HALF_SHIFT;
        (half_shifted >> HALF_SHIFT) as c_long
    };

    (lo, hi)
}
929+
930+
// `preadv2` flag requesting a read that does not wait for data: callers in this
// file treat an `EAGAIN` result as "data not in cache" and `ENOTSUP` as "not
// supported by the underlying filesystem".
pub(crate) const RWF_NOWAIT: c_int = 0x00000008;
931+
unsafe fn preadv2(
932+
fd: c_int,
933+
iov: *const iovec,
934+
iovcnt: c_int,
935+
offset: off_t,
936+
flags: c_int,
937+
) -> ssize_t {
938+
// Call via libc::syscall rather than libc::preadv2. preadv2 is only supported by glibc
939+
// and only since v2.26. By using syscall we don't need to worry about compatiblity with
940+
// old glibc versions and it will work on Android and musl too. The downside is that you
941+
// can't use `LD_PRELOAD` tricks any more to intercept these calls.
942+
let (lo, hi) = pos_to_lohi(offset);
943+
libc::syscall(libc::SYS_preadv2, fd, iov, iovcnt, lo, hi, flags) as ssize_t
944+
}
945+
}

tokio/tests/fs_file.rs

+21
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,27 @@ async fn basic_read() {
2222

2323
assert_eq!(n, HELLO.len());
2424
assert_eq!(&buf[..n], HELLO);
25+
26+
// Drop the data from the cache to stimulate uncached codepath on Linux (see preadv2 in
27+
// file.rs)
28+
#[cfg(target_os = "linux")]
29+
{
30+
use std::os::unix::io::AsRawFd;
31+
nix::unistd::fsync(tempfile.as_raw_fd()).unwrap();
32+
nix::fcntl::posix_fadvise(
33+
tempfile.as_raw_fd(),
34+
0,
35+
0,
36+
nix::fcntl::PosixFadviseAdvice::POSIX_FADV_DONTNEED,
37+
)
38+
.unwrap();
39+
}
40+
41+
let mut file = File::open(tempfile.path()).await.unwrap();
42+
let n = file.read(&mut buf).await.unwrap();
43+
44+
assert_eq!(n, HELLO.len());
45+
assert_eq!(&buf[..n], HELLO);
2546
}
2647

2748
#[tokio::test]

0 commit comments

Comments
 (0)