// Copyright ©2020 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package rdf

import (
	"bytes"
	"cmp"
	"errors"
	"fmt"
	"hash"
	"slices"
	"sort"

	"gonum.org/v1/gonum/internal/order"
)

// See "Canonical Forms for Isomorphic and Equivalent RDF Graphs: Algorithms
// for Leaning and Labelling Blank Nodes" by Aidan Hogan for a description of
// the algorithm, https://doi.org/10.1145/3068333, available free from
// the author's web page http://aidanhogan.com/docs/rdf-canonicalisation.pdf.
//
// Aspects of the implementation are from the discussion in v1.0 of the readme
// of the PoC at https://doi.org/10.5281/zenodo.3154322.

// Isomorphic returns whether the RDF graph datasets a and b are isomorphic,
// where there is a bijective mapping between blank nodes in a and b using
// the given hash function. If decomp is true, the graphs are decomposed
// before canonicalization.
func Isomorphic(a, b []*Statement, decomp bool, h hash.Hash) bool {
	if len(a) != len(b) {
		return false
	}

	zero := make([]byte, h.Size())
	ah, _ := IsoCanonicalHashes(a, decomp, true, h, zero)
	bh, _ := IsoCanonicalHashes(b, decomp, true, h, zero)
	if len(ah) != len(bh) {
		return false
	}
	work := make([][]byte, 2*len(ah))
	lexicalHashes(work[:len(ah)], ah)
	lexicalHashes(work[len(ah):], bh)
	for i := range work[:len(ah)] {
		if !bytes.Equal(work[i], work[i+len(ah)]) {
			return false
		}
	}
	return true
}

func lexicalHashes(dst [][]byte, hashes map[string][]byte) {
	i := 0
	for _, s := range hashes {
		dst[i] = s
		i++
	}
	order.BySliceValues(dst)
}

// IsoCanonicalHashes returns a mapping between the nodes of the RDF graph
// dataset described by the given statements using the provided hash
// function. If decomp is true, the graphs are decomposed before hashing.
// If dist is true and the input graph is decomposed into identical splits,
// the entire graph will be hashed to distinguish nodes. If decomp is false,
// dist has no effect.
// Blank node hashes are initially set to the value of zero. Hash values
// are provided for literal and IRI nodes as well as for blank nodes. The
// hash input for literal nodes includes the quotes and the input for IRI
// nodes first removes the angle quotes around the IRI, although these are
// included in the map keys.
//
// Note that hashes returned by IsoCanonicalHashes with decomp=true are not
// comparable with hashes returned by IsoCanonicalHashes with decomp=false.
//
// See http://aidanhogan.com/docs/rdf-canonicalisation.pdf for details of
// the hashing algorithm.
func IsoCanonicalHashes(statements []*Statement, decomp, dist bool, h hash.Hash, zero []byte) (hashes map[string][]byte, terms map[string]map[string]bool) {
	if len(statements) == 0 {
		return nil, nil
	}

	if debug {
		debug.log(0, "Statements:")
		for _, s := range statements {
			debug.log(0, s)
		}
		debug.log(0)
	}

	hash, parts, ok := hashBNodesPerSplit(statements, decomp, h, zero)

	if debug {
		debug.log(0, "Blanks:")
		if len(hash.blanks) != 0 {
			for _, b := range hash.blanks {
				debug.log(0, b)
			}
		} else {
			debug.log(0, "none")
		}
		debug.log(0)

		debug.log(0, "Parts:")
		debug.logParts(0, parts)

		debug.logf(0, "Hashes from hashBNodesPerSplit (splitting=%t):\n", decomp)
		debug.logHashes(0, hash.hashOf, h.Size())
	}

	if ok {
		return hash.hashOf, hash.termsFor
	}

	// TODO: remove the triviality exception in distinguish and return
	// the original hashes if this result is nil. Make the triviality
	// exception optional.
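
	// Reaching here means the initial hashing left at least two terms
	// sharing a hash, so refine the blank node labelling by recursively
	// distinguishing nodes within the co-equal partitions.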
	hashes = distinguish(statements, dist, h, zero, hash, parts, nil, 0)

	if hashes == nil {
		// distinguish was given trivial parts and
		// we did not ask it to try to merge them.
		return hash.hashOf, hash.termsFor
	}

	if debug {
		debug.log(0, "Final resolved Hashes:")
		debug.logHashes(0, hashes, h.Size())
	}

	terms = make(map[string]map[string]bool, len(hashes))
	for k, h := range hashes {
		terms[string(h)] = map[string]bool{k: true}
	}

	return hashes, terms
}

// C14n performs a relabeling of the statements in src based on the terms
// obtained from IsoCanonicalHashes, placing the results in dst and returning
// them. The relabeling scheme is the same as for the Universal RDF Dataset
// Normalization Algorithm: blank terms are ordered lexically by their hash
// value and then given a blank label with the prefix "_:c14n" and an
// identifier counter corresponding to the label's sort rank.
//
// If dst is nil, it is allocated; otherwise the length of dst must match the
// length of src.
func C14n(dst, src []*Statement, terms map[string]map[string]bool) ([]*Statement, error) {
	if dst == nil {
		dst = make([]*Statement, len(src))
	}

	if len(dst) != len(src) {
		return dst, errors.New("rdf: slice length mismatch")
	}

	need := make(map[string]bool)
	for _, s := range src {
		for _, t := range []string{
			s.Subject.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			if !isBlank(t) {
				continue
			}
			need[t] = true
		}
	}

	blanks := make([]string, len(need))
	i := 0
	for h, m := range terms {
		var ok bool
		for t := range m {
			if isBlank(t) {
				ok = true
				break
			}
		}
		if !ok {
			continue
		}
		if i == len(blanks) {
			return dst, errors.New("rdf: too many blanks in terms")
		}
		blanks[i] = h
		i++
	}
	slices.Sort(blanks)

	c14n := make(map[string]string)
	for i, b := range blanks {
		if len(terms[b]) == 0 {
			return nil, fmt.Errorf("rdf: no term for blank with hash %x", b)
		}
		for t := range terms[b] {
			if !isBlank(t) {
				continue
			}
			if _, exists := c14n[t]; exists {
				continue
			}
			delete(need, t)
			c14n[t] = fmt.Sprintf("_:c14n%d", i)
		}
	}

	if len(need) != 0 {
		return dst, fmt.Errorf("rdf: missing term hashes for %d terms", len(need))
	}

	for i, s := range src {
		if dst[i] == nil {
			dst[i] = &Statement{}
		}
		n := dst[i]
		n.Subject = Term{Value: translate(s.Subject.Value, c14n)}
		n.Predicate = s.Predicate
		n.Object = Term{Value: translate(s.Object.Value, c14n)}
		n.Label = Term{Value: translate(s.Label.Value, c14n)}
	}
	sortC14nStatements(dst)

	return dst, nil
}

func translate(term string, mapping map[string]string) string {
	if term, ok := mapping[term]; ok {
		return term
	}
	return term
}

func sortC14nStatements(statements []*Statement) {
	slices.SortFunc(statements, func(a, b *Statement) int {
		if n := cmp.Compare(a.Subject.Value, b.Subject.Value); n != 0 {
			return n
		}
		// Always IRI.
		if n := cmp.Compare(a.Predicate.Value, b.Predicate.Value); n != 0 {
			return n
		}
		return cmp.Compare(a.Object.Value, b.Object.Value)
	})
}

// hashBNodes returns the hashed blank nodes of the graph described by statements
// using the provided hash function. Hashes are initialised with zero.
//
// This is algorithm 1 in doi:10.1145/3068333.
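//
// Each iteration rehashes every blank node from the hashes of its
// neighbouring terms, accumulating the per-statement contributions in a
// commutative hash bag, and stops when either all blank nodes have unique
// hashes or the partition of blank nodes by hash stops changing.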
func hashBNodes(statements []*Statement, h hash.Hash, zero []byte, hash0 map[string][]byte) (hash *table, disjoint bool) {
	curr := newTable()
	for _, s := range statements {
		for i, t := range []string{
			s.Subject.Value,
			s.Predicate.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			switch {
			case i == 3 && t == "":
				continue
			case isBlank(t):
				if hash0 == nil {
					curr.set(t, zero)
				} else {
					curr.set(t, hash0[t])
				}
			case isIRI(t):
				h.Reset()
				h.Write([]byte(t[1 : len(t)-1]))
				curr.set(t, h.Sum(nil))
			default:
				h.Reset()
				h.Write([]byte(t))
				curr.set(t, h.Sum(nil))
			}
		}
	}

	bag := newHashBag(h, curr)
	last := curr.clone()
	for {
		curr, last = last, curr
		for _, s := range statements {
			if isBlank(s.Subject.Value) {
				var lab []byte
				if s.Label.Value != "" {
					lab = last.hashOf[s.Label.Value]
				}
				c := hashTuple(h, last.hashOf[s.Object.Value], last.hashOf[s.Predicate.Value], lab, []byte{'+'})
				bag.add(s.Subject.Value, c)
			}

			if isBlank(s.Object.Value) {
				var lab []byte
				if s.Label.Value != "" {
					lab = last.hashOf[s.Label.Value]
				}
				c := hashTuple(h, last.hashOf[s.Subject.Value], last.hashOf[s.Predicate.Value], lab, []byte{'-'})
				bag.add(s.Object.Value, c)
			}

			// This and the lab value above implement the label hashing
			// required for RDF dataset hashing as described in
			// https://doi.org/10.5281/zenodo.3154322 v1.0
			// Readme.md#adaptation-of-the-algorithms-to-handle-datasets.
			if isBlank(s.Label.Value) {
				c := hashTuple(h, last.hashOf[s.Subject.Value], last.hashOf[s.Predicate.Value], last.hashOf[s.Object.Value], []byte{'.'})
				bag.add(s.Label.Value, c)
			}
		}

		for t := range bag.hashesFor {
			curr.set(t, bag.sum(t))
		}

		disjoint = curr.allUnique()
		if disjoint || !curr.changedFrom(last) {
			return curr, disjoint
		}
	}
}

// table is a collision aware hash collection for RDF terms.
type table struct {
	// hashOf holds the hash for each term.
	hashOf map[string][]byte
	// termsFor holds the set of nodes in
	// the second key for terms that share
	// the hash in the first key.
	termsFor map[string]map[string]bool

	// isBlank and blanks are the set of blank
	// nodes.
	// isBlank is nil for cloned tables.
	isBlank map[string]bool
	// blanks is nil for tables created
	// with newTable.
	blanks []string
}

// newTable returns a new hash table.
func newTable() *table {
	return &table{
		hashOf:   make(map[string][]byte),
		termsFor: make(map[string]map[string]bool),
		isBlank:  make(map[string]bool),
	}
}

// wasCloned returns whether t is a parent or child of a cloning operation.
func (t *table) wasCloned() bool { return t.isBlank == nil }

// isNew returns whether t is a new table.
func (t *table) isNew() bool { return t.blanks == nil }

// clone returns a clone of the receiver.
func (t *table) clone() *table {
	new := &table{
		hashOf:   make(map[string][]byte),
		termsFor: make(map[string]map[string]bool),
	}
	for term, hash := range t.hashOf {
		new.hashOf[term] = hash
	}
	for hash, coll := range t.termsFor {
		if len(coll) == 0 {
			continue
		}
		terms := make(map[string]bool)
		for term := range coll {
			terms[term] = true
		}
		new.termsFor[hash] = terms
	}
	if t.isNew() {
		t.blanks = make([]string, len(t.isBlank))
		i := 0
		for n := range t.isBlank {
			t.blanks[i] = n
			i++
		}
		t.isBlank = nil
	}
	new.blanks = t.blanks
	return new
}

// TODO(kortschak): Make the hash table in table.hashOf reuse the []byte on update.
// This is not trivial since we need to check for changes, so we can't just get
// the current hash buffer and write into it. So if this is done we probably need
// a pair of buffers, a current and a waiting.

// set sets the hash of the term, removing any previously set hash.
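// set also maintains the reverse mapping from hash to terms in termsFor
// and, for tables created with newTable, records blank node labels in
// isBlank.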
func (t *table) set(term string, hash []byte) {
	prev := t.hashOf[term]
	if bytes.Equal(prev, hash) {
		return
	}
	t.hashOf[term] = hash

	// Delete any existing hashes for this term.
	switch terms := t.termsFor[string(prev)]; {
	case len(terms) == 1:
		delete(t.termsFor, string(prev))
	case len(terms) > 1:
		delete(terms, term)
	}

	terms, ok := t.termsFor[string(hash)]
	if ok {
		terms[term] = true
	} else {
		t.termsFor[string(hash)] = map[string]bool{term: true}
	}

	if !t.wasCloned() && isBlank(term) {
		// We are in the original table, so note
		// any blank node label that we see.
		t.isBlank[term] = true
	}
}

// allUnique returns whether every term has a unique hash. allUnique
// can only be called on a table that was returned by clone.
func (t *table) allUnique() bool {
	if t.isNew() {
		panic("checked hash bag from uncloned table")
	}
	for _, term := range t.blanks {
		if len(t.termsFor[string(t.hashOf[term])]) > 1 {
			return false
		}
	}
	return true
}

// changedFrom returns whether the receiver has been updated from last.
// changedFrom can only be called on a table that was returned by clone.
func (t *table) changedFrom(last *table) bool {
	if t.isNew() {
		panic("checked hash bag from uncloned table")
	}
	for i, x := range t.blanks {
		for _, y := range t.blanks[i+1:] {
			if bytes.Equal(t.hashOf[x], t.hashOf[y]) != bytes.Equal(last.hashOf[x], last.hashOf[y]) {
				return true
			}
		}
	}
	return false
}

// hashBag implements a commutative and associative hash.
// See notes in https://doi.org/10.5281/zenodo.3154322 v1.0
// Readme.md#what-is-the-precise-specification-of-hashbag.
type hashBag struct {
	hash      hash.Hash
	hashesFor map[string][][]byte
}

// newHashBag returns a new hashBag using the provided hash function for
// the given hash table. newHashBag can only take a table parameter that
// was returned by newTable.
func newHashBag(h hash.Hash, t *table) hashBag {
	if t.wasCloned() {
		panic("made hash bag from cloned table")
	}
	b := hashBag{hash: h, hashesFor: make(map[string][][]byte, len(t.isBlank))}
	for n := range t.isBlank {
		b.hashesFor[n] = [][]byte{t.hashOf[n]}
	}
	return b
}

// add adds the hash to the hash bag for the term.
func (b hashBag) add(term string, hash []byte) {
	b.hashesFor[term] = append(b.hashesFor[term], hash)
}

// sum calculates the hash sum for the given term, updates the hash bag
// state and returns the hash.
func (b hashBag) sum(term string) []byte {
	p := b.hashesFor[term]
	order.BySliceValues(p)
	h := hashTuple(b.hash, p...)
	b.hashesFor[term] = b.hashesFor[term][:1]
	b.hashesFor[term][0] = h
	return h
}

// hashTuple returns the h hash of the concatenation of t.
func hashTuple(h hash.Hash, t ...[]byte) []byte {
	h.Reset()
	for _, b := range t {
		h.Write(b)
	}
	return h.Sum(nil)
}

// hashBNodesPerSplit returns the independently hashed blank nodes of the
// graph described by statements using the provided hash function. Hashes
// are initialised with zero.
//
// This is algorithm 2 in doi:10.1145/3068333.
func hashBNodesPerSplit(statements []*Statement, decomp bool, h hash.Hash, zero []byte) (hash *table, parts byLengthHash, disjoint bool) {
	if !decomp {
		hash, ok := hashBNodes(statements, h, zero, nil)
		parts = appendOrdered(byLengthHash{}, hash.termsFor)
		sort.Sort(parts)
		return hash, parts, ok
	}

	splits := split(statements)

	// Avoid recombination work if there is only one split.
	if len(splits) == 1 {
		hash, ok := hashBNodes(statements, h, zero, nil)
		parts = appendOrdered(byLengthHash{}, hash.termsFor)
		sort.Sort(parts)
		return hash, parts, ok
	}

	hash = &table{hashOf: make(map[string][]byte)}
	disjoint = true
	for _, g := range splits {
		part, ok := hashBNodes(g, h, zero, nil)
		// Each split is guaranteed to be disjoint in its
		// set of blank nodes, so we can just append to our
		// collection of blanks.
		hash.blanks = append(hash.blanks, part.blanks...)
		if !ok {
			// Allow a short-circuit of the allUnique check.
			disjoint = false
		}
		for k, v := range part.hashOf {
			hash.hashOf[k] = v
		}
		parts = appendOrdered(parts, part.termsFor)
	}
	sort.Sort(parts)

	return hash, parts, disjoint && allUnique(hash.hashOf)
}

// appendOrdered adds parts (labels stored in the second key) for each
// hash (stored in the first key) to parts.
func appendOrdered(parts byLengthHash, partSets map[string]map[string]bool) byLengthHash {
	for h, s := range partSets {
		var p []string
		for e := range s {
			if isBlank(e) {
				p = append(p, e)
			}
		}
		if p != nil {
			parts.nodes = append(parts.nodes, p)
			parts.hashes = append(parts.hashes, h)
		}
	}
	return parts
}

// byLengthHash implements ascending length sort of a set of blank RDF
// term partitions with ties broken by lexical ordering of the partitions'
// hashes.
type byLengthHash struct {
	// nodes holds the blank nodes of a part.
	nodes [][]string
	// hashes holds the hashes corresponding
	// to the nodes in the nodes field, using
	// the same index.
	hashes []string
}

func (s byLengthHash) Len() int { return len(s.nodes) }
func (s byLengthHash) Less(i, j int) bool {
	switch {
	case len(s.nodes[i]) < len(s.nodes[j]):
		return true
	case len(s.nodes[i]) > len(s.nodes[j]):
		return false
	}
	return s.hashes[i] < s.hashes[j]
}
func (s byLengthHash) Swap(i, j int) {
	s.nodes[i], s.nodes[j] = s.nodes[j], s.nodes[i]
	s.hashes[i], s.hashes[j] = s.hashes[j], s.hashes[i]
}

// allUnique returns whether the []byte hash values in hashes are all unique.
func allUnique(hashes map[string][]byte) bool {
	set := make(map[string]bool)
	for _, h := range hashes {
		if set[string(h)] {
			return false
		}
		set[string(h)] = true
	}
	return true
}

// split returns the statements forming connected components in the graph
// described by statements.
//
// This is split in algorithm 2 in doi:10.1145/3068333.
func split(statements []*Statement) [][]*Statement {
	ds := make(djSet)
	for _, s := range statements {
		ds.add(s.Subject.Value)
		ds.add(s.Object.Value)
		if isBlank(s.Subject.Value) && isBlank(s.Object.Value) {
			ds.union(ds.find(s.Subject.Value), ds.find(s.Object.Value))
		}
	}

	var (
		splits [][]*Statement
		ground []*Statement
	)
	idxOf := make(map[*dsNode]int)
	for _, s := range statements {
		var t string
		switch {
		case isBlank(s.Subject.Value):
			t = s.Subject.Value
		case isBlank(s.Object.Value):
			t = s.Object.Value
		default:
			ground = append(ground, s)
			continue
		}
		r := ds.find(t)
		if r == nil {
			panic(fmt.Sprintf("term not found: %q", t))
		}
		i, ok := idxOf[r]
		if !ok {
			i = len(splits)
			idxOf[r] = i
			splits = append(splits, []*Statement{s})
		} else {
			splits[i] = append(splits[i], s)
		}
	}
	if ground != nil {
		splits = append(splits, ground)
	}

	if debug {
		debug.log(0, "Splits:")
		for i, s := range splits {
			for j, t := range s {
				if j == 0 {
					debug.logf(0, "%d.\t%s\n", i+1, t)
				} else {
					debug.logf(0, "\t%s\n", t)
				}
			}
			debug.log(0)
		}
	}

	return splits
}

// distinguish returns G⊥: smallest hash-labelled graph found thus far.
// The graph is returned as a node to hash lookup.
//
// This is part of algorithm 3 in doi:10.1145/3068333.
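//
// Each invocation selects the co-equal partitions of blank nodes sharing the
// first non-trivial hash, marks each of their nodes in turn, rehashes the
// graph and either records the fully distinguished result as a candidate
// lowest graph or recurses on the refined partition, keeping the least
// hash-labelled graph seen so far.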
//
// The correspondence between the parameters of the function in the paper
// and the implementation here is as follows:
//  - G = statements
//  - hash = hash
//  - P = parts (already sorted by hashBNodesPerSplit)
//  - G⊥ = lowest
//  - B = hash.blanks
//
// The additional parameter dist specifies that distinguish should treat
// coequal trivial parts as a coarse intermediate part and distinguish
// the nodes in that merged part.
func distinguish(statements []*Statement, dist bool, h hash.Hash, zero []byte, hash *table, parts byLengthHash, lowest map[string][]byte, depth int) map[string][]byte {
	if debug {
		debug.log(depth, "Running Distinguish")
	}

	var small []string
	var k int
	for k, small = range parts.nodes {
		if len(small) > 1 {
			break
		}
	}
	if len(small) < 2 {
		if lowest != nil || !dist {
			if debug {
				debug.log(depth, "Return lowest (no non-trivial parts):")
				debug.logHashes(depth, lowest, h.Size())
			}
			return lowest
		}
		// We have been given a set of fine parts,
		// but to reach here they must have been
		// non-uniquely labeled, so treat them
		// as a single coarse part.
		k, small = 0, parts.nodes[0]
	}

	if debug {
		debug.logf(depth, "Part: %v %x\n\n", small, parts.hashes[k])
		debug.log(depth, "Orig hash:")
		debug.logHashes(depth, hash.hashOf, h.Size())
	}

	smallHash := hash.hashOf[small[0]]
	for _, p := range parts.nodes[k:] {
		if !bytes.Equal(smallHash, hash.hashOf[p[0]]) {
			if debug {
				debug.logf(depth, "End of co-equal hashes: %x != %x\n\n", smallHash, hash.hashOf[p[0]])
			}
			break
		}
		for i, b := range p {
			if debug {
				debug.logf(depth, "Iter: %d — B = %q\n\n", i, b)
				if depth == 0 {
					debug.log(depth, "Current lowest:\n")
					debug.logHashes(depth, lowest, h.Size())
				}
			}
			hashP := hash.clone()
			hashP.set(b, hashTuple(h, hashP.hashOf[b], []byte{'@'}))
			hashPP, ok := hashBNodes(statements, h, zero, hashP.hashOf)
			if ok {
				if debug {
					debug.log(depth, "hashPP is trivial")
					debug.log(depth, "comparing hashPP\n")
					debug.logHashes(depth, hashPP.hashOf, h.Size())
					debug.log(depth, "with previous\n")
					debug.logHashes(depth, lowest, h.Size())
				}
				if lowest == nil || graphLess(statements, hashPP.hashOf, lowest) {
					lowest = hashPP.hashOf
					debug.log(depth, "choose hashPP\n")
				}
			} else {
				partsP := appendOrdered(byLengthHash{}, hashPP.termsFor)
				sort.Sort(partsP)
				if debug {
					debug.log(depth, "Parts':")
					debug.logParts(depth, partsP)
					debug.log(depth, "Recursive distinguish")
					debug.log(depth, "Called with current lowest:\n")
					debug.logHashes(depth, lowest, h.Size())
				}
				lowest = distinguish(statements, dist, h, zero, hashPP, partsP, lowest, depth+1)
			}
		}
	}

	if debug {
		debug.log(depth, "Return lowest:")
		debug.logHashes(depth, lowest, h.Size())
	}
	return lowest
}

// graphLess returns whether the graph described by statements and relabeled
// by a orders before the same graph relabeled by b: terms ordered
// syntactically, triples ordered lexicographically, and graphs ordered such
// that G < H if and only if G ⊂ H or there exists a triple t ∈ G \ H such
// that no triple t' ∈ H \ G exists where t' < t.
// p9 https://doi.org/10.1145/3068333
func graphLess(statements []*Statement, a, b map[string][]byte) bool {
	g := newLexicalStatements(statements, a)
	sort.Sort(g)
	h := newLexicalStatements(statements, b)
	sort.Sort(h)

	gSubH := sub(g, h, len(g.statements))
	if len(gSubH) == 0 {
		return true
	}
	hSubG := sub(h, g, 1)
	if len(hSubG) == 0 {
		return true
	}
	lowestH := relabeledStatement{hSubG[0], h.hashes}
	for _, s := range gSubH {
		rs := relabeledStatement{s, g.hashes}
		if rs.less(lowestH) {
			return true
		}
	}
	return false
}

// lexicalStatements is a sort implementation for Statements with blank
// node labels replaced with their hash.
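// Sorting a lexicalStatements yields the statement ordering used by
// graphLess when comparing two candidate relabelings of the same graph.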
type lexicalStatements struct {
	statements []*Statement
	hashes     map[string][]byte
}

func newLexicalStatements(statements []*Statement, hash map[string][]byte) lexicalStatements {
	s := lexicalStatements{
		statements: make([]*Statement, len(statements)),
		hashes:     hash,
	}
	copy(s.statements, statements)
	return s
}

// sub returns the difference between a and b up to max elements long.
func sub(a, b lexicalStatements, max int) []*Statement {
	var d []*Statement
	var i, j int
	for i < len(a.statements) && j < len(b.statements) && len(d) < max {
		ra := relabeledStatement{a.statements[i], a.hashes}
		rb := relabeledStatement{b.statements[j], b.hashes}
		switch {
		case ra.less(rb):
			d = append(d, a.statements[i])
			i++
		case rb.less(ra):
			j++
		default:
			i++
		}
	}
	if len(d) < max {
		d = append(d, a.statements[i:min(len(a.statements), i+max-len(d))]...)
	}
	return d
}

func (s lexicalStatements) Len() int { return len(s.statements) }
func (s lexicalStatements) Less(i, j int) bool {
	return relabeledStatement{s.statements[i], s.hashes}.less(relabeledStatement{s.statements[j], s.hashes})
}
func (s lexicalStatements) Swap(i, j int) {
	s.statements[i], s.statements[j] = s.statements[j], s.statements[i]
}

// relabeledStatement is a statement that is orderable by its blank node
// hash relabeling.
type relabeledStatement struct {
	statement *Statement
	labels    map[string][]byte
}

func (a relabeledStatement) less(b relabeledStatement) bool {
	switch {
	case relabeledTerm{a.statement.Subject, a.labels}.less(relabeledTerm{b.statement.Subject, b.labels}):
		return true
	case relabeledTerm{b.statement.Subject, b.labels}.less(relabeledTerm{a.statement.Subject, a.labels}):
		return false
	}
	switch {
	// Always IRI.
	case a.statement.Predicate.Value < b.statement.Predicate.Value:
		return true
	case a.statement.Predicate.Value > b.statement.Predicate.Value:
		return false
	}
	switch {
	case relabeledTerm{a.statement.Object, a.labels}.less(relabeledTerm{b.statement.Object, b.labels}):
		return true
	case relabeledTerm{b.statement.Object, b.labels}.less(relabeledTerm{a.statement.Object, a.labels}):
		return false
	}
	return relabeledTerm{a.statement.Label, a.labels}.less(relabeledTerm{b.statement.Label, b.labels})
}

func (s relabeledStatement) String() string {
	subj := relabeledTerm{term: s.statement.Subject, labels: s.labels}
	obj := relabeledTerm{term: s.statement.Object, labels: s.labels}
	if s.statement.Label.Value == "" {
		return fmt.Sprintf("%s %s %s .", subj, s.statement.Predicate.Value, obj)
	}
	lab := relabeledTerm{term: s.statement.Label, labels: s.labels}
	return fmt.Sprintf("%s %s %s %s .", subj, s.statement.Predicate.Value, obj, lab)
}

// relabeledTerm is a term that is orderable by its blank node hash relabeling.
type relabeledTerm struct {
	term   Term
	labels map[string][]byte
}

func (a relabeledTerm) less(b relabeledTerm) bool {
	aIsBlank := isBlank(a.term.Value)
	bIsBlank := isBlank(b.term.Value)
	switch {
	case aIsBlank && bIsBlank:
		return bytes.Compare(a.labels[a.term.Value], b.labels[b.term.Value]) < 0
	case aIsBlank:
		return blankPrefix < unquoteIRI(b.term.Value)
	case bIsBlank:
		return unquoteIRI(a.term.Value) < blankPrefix
	default:
		return unquoteIRI(a.term.Value) < unquoteIRI(b.term.Value)
	}
}

func unquoteIRI(s string) string {
	if len(s) > 1 && s[0] == '<' && s[len(s)-1] == '>' {
		s = s[1 : len(s)-1]
	}
	return s
}

func (t relabeledTerm) String() string {
	if !isBlank(t.term.Value) {
		return t.term.Value
	}
	h, ok := t.labels[t.term.Value]
	if !ok {
		return t.term.Value + "_missing_hash"
	}
	return fmt.Sprintf("_:%0x", h)
}
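
// The sketch below is illustrative only and is not part of the original
// package source. It shows one way the exported API above could be combined,
// assuming the caller already holds two parsed []*Statement datasets (for
// example decoded from N-Quads): the first dataset is hashed with SHA-256,
// its blank nodes are relabeled with C14n, and the two datasets are compared
// for isomorphism. The canonicalize name and its parameters are hypothetical.
//
//	func canonicalize(statements, other []*Statement) ([]*Statement, bool, error) {
//		h := sha256.New() // crypto/sha256; any hash.Hash works.
//		zero := make([]byte, h.Size())
//
//		// Hash all terms, decomposing the dataset into splits and
//		// distinguishing co-equal blank nodes.
//		_, terms := IsoCanonicalHashes(statements, true, true, h, zero)
//
//		// Relabel blank nodes as _:c14n0, _:c14n1, ... in hash order.
//		relabeled, err := C14n(nil, statements, terms)
//		if err != nil {
//			return nil, false, err
//		}
//
//		// Check whether a blank node bijection exists between the datasets.
//		return relabeled, Isomorphic(statements, other, true, h), nil
//	}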