Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NQuads, TriG: Support writing to the default graph #615

Merged
merged 5 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ flycheck_*.el
/nemo/test-files
/nemo-benches/test-files

# default output directory
/results

# nix build
/result

Expand Down
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ Add `quickcheck` tests whenever it is applicable.

### Integration testing

Integration testing is done in the related `tests` directory on the top-level of this crate.
Integration testing is done in the related `tests` directory on the top-level of the `nemo-cli` crate.

## Coding conventions

Start reading our code and you'll get the hang of it. Code format and essential coding guidelines are already ensured
by our use of `rstufmt` and `clippy` (as mentioned above). Some further conventions are listed below.
by our use of `rustfmt` and `clippy` (as mentioned above). Some further conventions are listed below.

* We try to reduce redundancies in enumeration-variant names.
* We try to use the `where` clause over embedded clauses for better readability
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[workspace]
resolver = "2"
default-members = [
"nemo",
"nemo",
"nemo-cli",
"nemo-physical",
"nemo-python",
Expand Down
1 change: 0 additions & 1 deletion nemo-language-server/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ homepage.workspace = true
license.workspace = true
readme = "README.md"
repository.workspace = true
default-run = "nemo-language-server"

[[bin]]
name = "nemo-language-server"
Expand Down
7 changes: 7 additions & 0 deletions nemo/src/io/formats/rdf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,13 @@ use crate::{
use super::FileFormatMeta;
use super::{ExportHandler, FormatBuilder, ImportHandler, TableWriter};

/// IRI that Nemo uses to denote the default graph of RDF data with
/// named graphs (quads).
///
/// The RDF reader substitutes this IRI when a quad carries no graph name, and
/// the RDF writer maps a graph name equal to this IRI back to the default graph.
///
/// SPARQL 1.1 does not provide a standard identifier for this purpose.
/// If a future SPARQL or RDF version adds one, we could align accordingly.
const DEFAULT_GRAPH_IRI: &str = "tag:nemo:defaultgraph";

/// The different supported variants of the RDF format.
#[derive(Assoc, Debug, Clone, Copy, PartialEq, Eq, VariantArray)]
#[func(pub fn media_type(&self) -> &'static str)]
Expand Down
128 changes: 8 additions & 120 deletions nemo/src/io/formats/rdf/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,9 @@ use crate::io::formats::PROGRESS_NOTIFY_INCREMENT;
use super::{
error::RdfFormatError,
value_format::{RdfValueFormat, RdfValueFormats},
RdfVariant,
RdfVariant, DEFAULT_GRAPH_IRI,
};

/// IRI to be used for the default graph used by Nemo when loading RDF data with
/// named graphs (quads).
///
/// SPARQL 1.1 has failed to provide any standard identifier for this purpose.
/// If future SPARQL or RDF versions are adding this, we could align accordingly.
const DEFAULT_GRAPH: &str = "tag:nemo:defaultgraph";

/// A [TableProvider] for RDF 1.1 files containing triples.
pub(super) struct RdfReader {
/// Buffer from which content is read
Expand Down Expand Up @@ -157,7 +150,7 @@ impl RdfReader {
value: Option<GraphName<'_>>,
) -> Result<AnyDataValue, RdfFormatError> {
match value {
None => Ok(AnyDataValue::new_iri(DEFAULT_GRAPH.to_string())),
None => Ok(AnyDataValue::new_iri(DEFAULT_GRAPH_IRI.to_string())),
Some(GraphName::NamedNode(nn)) => Ok(Self::datavalue_from_named_node(nn)),
Some(GraphName::BlankNode(bn)) => {
Ok(Self::datavalue_from_blank_node(bnode_map, tuple_writer, bn))
Expand Down Expand Up @@ -368,7 +361,7 @@ impl ByteSized for RdfReader {

#[cfg(test)]
mod test {
use super::{RdfReader, DEFAULT_GRAPH};
use super::{RdfReader, DEFAULT_GRAPH_IRI};
use std::cell::RefCell;

use nemo_physical::{
Expand Down Expand Up @@ -454,7 +447,7 @@ mod test {
let dict = RefCell::new(Dict::default());
let mut tuple_writer = TupleWriter::new(&dict, 3);
let mut null_map = NullMap::default();
let graph_dv = AnyDataValue::new_iri(DEFAULT_GRAPH.to_string());
let graph_dv = AnyDataValue::new_iri(DEFAULT_GRAPH_IRI.to_string());

// check that we use our own default graph IRI
assert_eq!(
Expand All @@ -463,115 +456,10 @@ mod test {
);
// check that our default graph is a valid IRI in the first place
assert_eq!(
Iri::parse(DEFAULT_GRAPH.to_string()).unwrap().to_string(),
DEFAULT_GRAPH.to_string()
Iri::parse(DEFAULT_GRAPH_IRI.to_string())
.unwrap()
.to_string(),
DEFAULT_GRAPH_IRI.to_string()
);
}

// #[test]
// fn example_1() {
// macro_rules! parse_example_with_rdf_parser {
// ($data:tt, $make_parser:expr) => {
// let $data = r#"<http://one.example/subject1> <http://one.example/predicate1> <http://one.example/object1> . # comments here
// # or on a line by themselves
// _:subject1 <http://an.example/predicate1> "object1" .
// _:subject2 <http://an.example/predicate2> "object2" .
// "#.as_bytes();

// let dict = RefCell::new(Dict::default());
// let mut builders = vec![
// PhysicalBuilderProxyEnum::String(PhysicalStringColumnBuilderProxy::new(&dict)),
// PhysicalBuilderProxyEnum::String(PhysicalStringColumnBuilderProxy::new(&dict)),
// PhysicalBuilderProxyEnum::String(PhysicalStringColumnBuilderProxy::new(&dict)),
// ];
// let reader = RDFReader::new(ResourceProviders::empty(), String::new(), None, vec![PrimitiveType::Any, PrimitiveType::Any, PrimitiveType::Any]);

// let result = reader.read_triples_with_parser(&mut builders, $make_parser);
// assert!(result.is_ok());

// let columns = builders
// .into_iter()
// .map(|builder| match builder {
// PhysicalBuilderProxyEnum::String(b) => b.finalize(),
// _ => unreachable!("only string columns here"),
// })
// .collect::<Vec<_>>();

// log::debug!("columns: {columns:?}");
// let triples = (0..=2)
// .map(|idx| {
// columns
// .iter()
// .map(|column| {
// column
// .get(idx)
// .and_then(|value| value.try_into().ok())
// .and_then(|u64: u64| usize::try_from(u64).ok())
// .and_then(|usize| dict.borrow_mut().get(usize))
// .unwrap()
// })
// .map(PhysicalString::from)
// .collect::<Vec<_>>()
// })
// .collect::<Vec<_>>();
// log::debug!("triple: {triples:?}");
// for (value, expected) in PrimitiveType::Any.serialize_output(DataValueIteratorT::String(Box::new(triples[0].iter().cloned()))).zip(vec!["http://one.example/subject1", "http://one.example/predicate1", "http://one.example/object1"]) {
// assert_eq!(value, expected);
// }
// for (value, expected) in PrimitiveType::Any.serialize_output(DataValueIteratorT::String(Box::new(triples[1].iter().cloned()))).zip(vec!["_:subject1", "http://an.example/predicate1", r#""object1""#]) {
// assert_eq!(value, expected);
// }
// for (value, expected) in PrimitiveType::Any.serialize_output(DataValueIteratorT::String(Box::new(triples[2].iter().cloned()))).zip(vec!["_:subject2", "http://an.example/predicate2", r#""object2""#]) {
// assert_eq!(value, expected);
// }
// };
// }

// parse_example_with_rdf_parser!(reader, || NTriplesParser::new(reader));
// parse_example_with_rdf_parser!(reader, || TurtleParser::new(reader, None));
// }

// #[test]
// fn rollback() {
// let data = r#"<http://example.org/> <http://example.org/> <http://example.org/> .
// malformed <http://example.org/> <http://example.org/>
// <http://example.org/> malformed <http://example.org/> .
// <http://example.org/> <http://example.org/> malformed .
// <http://example.org/> <http://example.org/> "123"^^<http://www.w3.org/2001/XMLSchema#integer> .
// <http://example.org/> <http://example.org/> "123.45"^^<http://www.w3.org/2001/XMLSchema#integer> .
// <http://example.org/> <http://example.org/> "123.45"^^<http://www.w3.org/2001/XMLSchema#decimal> .
// <http://example.org/> <http://example.org/> "123.45a"^^<http://www.w3.org/2001/XMLSchema#decimal> .
// <https://example.org/> <https://example.org/> <https://example.org/> .
// "#
// .as_bytes();

// let dict = RefCell::new(Dict::default());
// let mut builders = vec![
// PhysicalBuilderProxyEnum::String(PhysicalStringColumnBuilderProxy::new(&dict)),
// PhysicalBuilderProxyEnum::String(PhysicalStringColumnBuilderProxy::new(&dict)),
// PhysicalBuilderProxyEnum::String(PhysicalStringColumnBuilderProxy::new(&dict)),
// ];
// let reader = RDFReader::new(
// ResourceProviders::empty(),
// String::new(),
// None,
// vec![PrimitiveType::Any, PrimitiveType::Any, PrimitiveType::Any],
// );

// let result = reader.read_triples_with_parser(&mut builders, || NTriplesParser::new(data));
// assert!(result.is_ok());

// let columns = builders
// .into_iter()
// .map(|builder| match builder {
// PhysicalBuilderProxyEnum::String(b) => b.finalize(),
// _ => unreachable!("only string columns here"),
// })
// .collect::<Vec<_>>();

// assert_eq!(columns.len(), 3);
// assert_eq!(columns[0].len(), 4);
// assert_eq!(columns[1].len(), 4);
// assert_eq!(columns[2].len(), 4);
// }
}
82 changes: 54 additions & 28 deletions nemo/src/io/formats/rdf/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use crate::{

use super::{
value_format::{RdfValueFormat, RdfValueFormats},
RdfVariant,
RdfVariant, DEFAULT_GRAPH_IRI,
};

/// Private struct to record the type of an RDF term that
Expand All @@ -31,13 +31,43 @@ enum RdfTermType {
SimpleStringLiteral,
}

/// Owned graph name of a quad that is buffered for export.
///
/// Defaults to [`QuadGraphName::DefaultGraph`].
#[derive(Debug, Default, Clone, PartialEq, Eq)]
enum QuadGraphName {
    /// The default graph; such quads are written without a graph name.
    #[default]
    DefaultGraph,
    /// A named graph, identified by the contained IRI.
    NamedNode(String),
    /// A graph identified by a blank node with the contained id.
    BlankNode(String),
}

/// Error signalling that a data value cannot be used as the graph name of a quad.
///
/// Only IRIs and nulls (exported as blank nodes) are accepted as graph names.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct InvalidGraphNameError;

impl std::fmt::Display for InvalidGraphNameError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("only IRIs and nulls (blank nodes) can be used as RDF graph names")
    }
}

impl std::error::Error for InvalidGraphNameError {}

/// Conversion of a data value into the graph name of a quad.
///
/// * IRIs equal to [`DEFAULT_GRAPH_IRI`] denote the default graph.
/// * All other IRIs denote named graphs.
/// * Nulls become blank nodes, using their lexical value as the id.
/// * Values from any other domain are rejected.
impl TryFrom<&AnyDataValue> for QuadGraphName {
    type Error = InvalidGraphNameError;

    fn try_from(value: &AnyDataValue) -> Result<Self, Self::Error> {
        let graph_name = match value.value_domain() {
            ValueDomain::Iri => match value.to_iri_unchecked() {
                // Nemo's own marker IRI stands for "no graph name".
                iri if iri == DEFAULT_GRAPH_IRI => Self::DefaultGraph,
                iri => Self::NamedNode(iri),
            },
            ValueDomain::Null => Self::BlankNode(value.lexical_value()),
            _ => return Err(InvalidGraphNameError),
        };

        Ok(graph_name)
    }
}

/// Struct to store information of one quad (or triple) for export.
/// This is necessary since all RIO RDF term implementations use `&str`
/// pointers internally, that must be owned elsewhere.
#[derive(Debug, Default)]
struct QuadBuffer {
graph_name_is_blank: bool,
graph_name: String,
graph_name: QuadGraphName,
subject_is_blank: bool,
subject: String,
predicate: String,
Expand Down Expand Up @@ -88,15 +118,15 @@ impl<'a> QuadBuffer {
}
}

fn graph_name(&'a self) -> GraphName<'a> {
if self.graph_name_is_blank {
GraphName::BlankNode(BlankNode {
id: self.graph_name.as_str(),
})
} else {
GraphName::NamedNode(NamedNode {
iri: self.graph_name.as_str(),
})
/// Returns the buffered graph name as a borrowed RDF term, or `None`
/// if the quad belongs to the default graph.
fn graph_name(&'a self) -> Option<GraphName<'a>> {
    let name = match &self.graph_name {
        QuadGraphName::DefaultGraph => return None,
        QuadGraphName::NamedNode(iri) => GraphName::NamedNode(NamedNode { iri: iri.as_str() }),
        QuadGraphName::BlankNode(id) => GraphName::BlankNode(BlankNode { id: id.as_str() }),
    };

    Some(name)
}

Expand Down Expand Up @@ -170,20 +200,13 @@ impl<'a> QuadBuffer {
true
}

fn set_graph_name_from_datavalue(&mut self, datavalue: &AnyDataValue) -> bool {
match datavalue.value_domain() {
ValueDomain::Iri => {
self.graph_name = datavalue.to_iri_unchecked();
self.graph_name_is_blank = false;
true
}
ValueDomain::Null => {
self.graph_name = datavalue.lexical_value();
self.graph_name_is_blank = true;
true
}
_ => false,
}
/// Stores the graph name derived from `datavalue` in this buffer.
///
/// # Errors
///
/// Returns [`InvalidGraphNameError`] if the value cannot be converted into a
/// graph name; the buffered graph name is left untouched in that case.
fn set_graph_name_from_datavalue(
    &mut self,
    datavalue: &AnyDataValue,
) -> Result<(), InvalidGraphNameError> {
    QuadGraphName::try_from(datavalue).map(|graph_name| {
        self.graph_name = graph_name;
    })
}
}

Expand Down Expand Up @@ -318,14 +341,17 @@ impl RdfWriter {
if !buffer.set_object_from_datavalue(&record[o_pos]) {
continue;
}
if !buffer.set_graph_name_from_datavalue(&record[g_pos]) {
if buffer
.set_graph_name_from_datavalue(&record[g_pos])
.is_err()
{
continue;
}
if let Err(e) = formatter.format(&Quad {
subject: buffer.subject(),
predicate: buffer.predicate(),
object: buffer.object(),
graph_name: Some(buffer.graph_name()),
graph_name: buffer.graph_name(),
}) {
log::debug!("failed to write quad: {e}");
drop_count += 1;
Expand Down