// Copyright ©2020 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package rdf

import (
	"bytes"
	"cmp"
	"errors"
	"fmt"
	"hash"
	"slices"
	"sort"

	"gonum.org/v1/gonum/internal/order"
)
// See "Canonical Forms for Isomorphic and Equivalent RDF Graphs: Algorithms
|
|
// for Leaning and Labelling Blank Nodes" by Aiden Hogan for description of
|
|
// the algorithm, https://doi.org/10.1145/3068333 and available free from
|
|
// the author's web page http://aidanhogan.com/docs/rdf-canonicalisation.pdf.
|
|
//
|
|
// Aspects of implementation from discussion in v1.0 of the readme of the PoC
|
|
// at https://doi.org/10.5281/zenodo.3154322
|
|
|
|
// Isomorphic returns whether the RDF graph datasets a and b are isomorphic,
|
|
// where there is a bijective mapping between blank nodes in a and b using
|
|
// the given hash function. If decomp is true, the graphs are decomposed
|
|
// before canonicalization.
|
|
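//
// A minimal usage sketch (graphA and graphB are hypothetical, previously
// parsed []*Statement values; SHA-256 is an arbitrary choice of hash):
//
//	h := sha256.New()
//	if Isomorphic(graphA, graphB, true, h) {
//		// The two datasets differ only in their blank node labels.
//	}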
func Isomorphic(a, b []*Statement, decomp bool, h hash.Hash) bool {
	if len(a) != len(b) {
		return false
	}

	zero := make([]byte, h.Size())
	ah, _ := IsoCanonicalHashes(a, decomp, true, h, zero)
	bh, _ := IsoCanonicalHashes(b, decomp, true, h, zero)
	if len(ah) != len(bh) {
		return false
	}

	work := make([][]byte, 2*len(ah))
	lexicalHashes(work[:len(ah)], ah)
	lexicalHashes(work[len(ah):], bh)
	for i := range work[:len(ah)] {
		if !bytes.Equal(work[i], work[i+len(ah)]) {
			return false
		}
	}
	return true
}

func lexicalHashes(dst [][]byte, hashes map[string][]byte) {
	i := 0
	for _, s := range hashes {
		dst[i] = s
		i++
	}
	order.BySliceValues(dst)
}

// IsoCanonicalHashes returns a mapping between the nodes of the RDF graph
// dataset described by the given statements using the provided hash
// function. If decomp is true, the graphs are decomposed before hashing.
// If dist is true and the graph decomposes into identical splits, the
// entire graph will be hashed to distinguish nodes. If decomp is false,
// dist has no effect.
//
// Blank node hashes are initially set to the value of zero. Hash values
// are provided for literal and IRI nodes as well as for blank nodes. The
// hash input for literal nodes includes the quotes, and the input for IRI
// nodes first removes the angle quotes around the IRI, although these are
// included in the map keys.
//
// Note that hashes returned by IsoCanonicalHashes with decomp=true are not
// comparable with hashes returned by IsoCanonicalHashes with decomp=false.
//
// See http://aidanhogan.com/docs/rdf-canonicalisation.pdf for details of
// the hashing algorithm.
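//
// A sketch of direct use (statements is a hypothetical []*Statement;
// sha256 is an arbitrary choice of hash, and the zero slice length must
// match the hash's size):
//
//	h := sha256.New()
//	zero := make([]byte, h.Size())
//	hashes, terms := IsoCanonicalHashes(statements, true, true, h, zero)
//	// hashes maps each term to its hash; terms groups terms by hash.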
func IsoCanonicalHashes(statements []*Statement, decomp, dist bool, h hash.Hash, zero []byte) (hashes map[string][]byte, terms map[string]map[string]bool) {
	if len(statements) == 0 {
		return nil, nil
	}

	if debug {
		debug.log(0, "Statements:")
		for _, s := range statements {
			debug.log(0, s)
		}
		debug.log(0)
	}

	hash, parts, ok := hashBNodesPerSplit(statements, decomp, h, zero)

	if debug {
		debug.log(0, "Blanks:")
		if len(hash.blanks) != 0 {
			for _, b := range hash.blanks {
				debug.log(0, b)
			}
		} else {
			debug.log(0, "none")
		}
		debug.log(0)

		debug.log(0, "Parts:")
		debug.logParts(0, parts)

		debug.logf(0, "Hashes from hashBNodesPerSplit (splitting=%t):\n", decomp)
		debug.logHashes(0, hash.hashOf, h.Size())
	}

	if ok {
		return hash.hashOf, hash.termsFor
	}

	// TODO: remove the triviality exception in distinguish and return
	// the original hashes if this result is nil. Make the triviality
	// exception optional.
	hashes = distinguish(statements, dist, h, zero, hash, parts, nil, 0)

	if hashes == nil {
		// distinguish was given trivial parts and
		// we did not ask it to try to merge them.
		return hash.hashOf, hash.termsFor
	}

	if debug {
		debug.log(0, "Final resolved Hashes:")
		debug.logHashes(0, hashes, h.Size())
	}

	terms = make(map[string]map[string]bool, len(hashes))
	for k, h := range hashes {
		terms[string(h)] = map[string]bool{k: true}
	}

	return hashes, terms
}

// C14n performs a relabeling of the statements in src based on the terms
// obtained from IsoCanonicalHashes, placing the results in dst and returning
// them. The relabeling scheme is the same as for the Universal RDF Dataset
// Normalization Algorithm: blank terms are ordered lexically by their hash
// value and then given a blank label with the prefix "_:c14n" and an
// identifier counter corresponding to the label's sort rank.
//
// If dst is nil, it is allocated; otherwise the length of dst must match the
// length of src.
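//
// A sketch of the canonicalization pipeline, under the same assumptions
// as the IsoCanonicalHashes example above (statements is a hypothetical
// []*Statement):
//
//	h := sha256.New()
//	zero := make([]byte, h.Size())
//	_, terms := IsoCanonicalHashes(statements, true, true, h, zero)
//	canon, err := C14n(nil, statements, terms)
//	// On success, canon uses _:c14n0, _:c14n1, ... as blank node labels.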
func C14n(dst, src []*Statement, terms map[string]map[string]bool) ([]*Statement, error) {
	if dst == nil {
		dst = make([]*Statement, len(src))
	}

	if len(dst) != len(src) {
		return dst, errors.New("rdf: slice length mismatch")
	}

	need := make(map[string]bool)
	for _, s := range src {
		for _, t := range []string{
			s.Subject.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			if !isBlank(t) {
				continue
			}
			need[t] = true
		}
	}

	blanks := make([]string, len(need))
	i := 0
	for h, m := range terms {
		var ok bool
		for t := range m {
			if isBlank(t) {
				ok = true
				break
			}
		}
		if !ok {
			continue
		}
		if i == len(blanks) {
			return dst, errors.New("rdf: too many blanks in terms")
		}
		blanks[i] = h
		i++
	}
	slices.Sort(blanks)

	c14n := make(map[string]string)
	for i, b := range blanks {
		if len(terms[b]) == 0 {
			return nil, fmt.Errorf("rdf: no term for blank with hash %x", b)
		}
		for t := range terms[b] {
			if !isBlank(t) {
				continue
			}
			if _, exists := c14n[t]; exists {
				continue
			}
			delete(need, t)
			c14n[t] = fmt.Sprintf("_:c14n%d", i)
		}
	}

	if len(need) != 0 {
		return dst, fmt.Errorf("rdf: missing term hashes for %d terms", len(need))
	}

	for i, s := range src {
		if dst[i] == nil {
			dst[i] = &Statement{}
		}
		n := dst[i]
		n.Subject = Term{Value: translate(s.Subject.Value, c14n)}
		n.Predicate = s.Predicate
		n.Object = Term{Value: translate(s.Object.Value, c14n)}
		n.Label = Term{Value: translate(s.Label.Value, c14n)}
	}
	sortC14nStatements(dst)

	return dst, nil
}

// translate returns the translation of term in mapping, or term
// unchanged if no translation is present.
func translate(term string, mapping map[string]string) string {
	if term, ok := mapping[term]; ok {
		return term
	}
	return term
}

func sortC14nStatements(statements []*Statement) {
	slices.SortFunc(statements, func(a, b *Statement) int {
		if n := cmp.Compare(a.Subject.Value, b.Subject.Value); n != 0 {
			return n
		}

		// Always IRI.
		if n := cmp.Compare(a.Predicate.Value, b.Predicate.Value); n != 0 {
			return n
		}

		return cmp.Compare(a.Object.Value, b.Object.Value)
	})
}

// hashBNodes returns the hashed blank nodes of the graph described by
// statements using the provided hash function. Hashes are initialised
// with zero.
//
// This is algorithm 1 in doi:10.1145/3068333.
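//
// For example, in one refinement pass over the triple _:b <p> "o" .,
// the subject's bag collects a tuple hash over the previous iteration's
// hashes, marked '+' for the subject position (mirroring the loop body
// below; map keys keep literal quotes and IRI angle brackets):
//
//	c := hashTuple(h, last.hashOf[`"o"`], last.hashOf["<p>"], nil, []byte{'+'})
//	bag.add("_:b", c)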
func hashBNodes(statements []*Statement, h hash.Hash, zero []byte, hash0 map[string][]byte) (hash *table, disjoint bool) {
	curr := newTable()
	for _, s := range statements {
		for i, t := range []string{
			s.Subject.Value,
			s.Predicate.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			switch {
			case i == 3 && t == "":
				continue
			case isBlank(t):
				if hash0 == nil {
					curr.set(t, zero)
				} else {
					curr.set(t, hash0[t])
				}
			case isIRI(t):
				h.Reset()
				h.Write([]byte(t[1 : len(t)-1]))
				curr.set(t, h.Sum(nil))
			default:
				h.Reset()
				h.Write([]byte(t))
				curr.set(t, h.Sum(nil))
			}
		}
	}

	bag := newHashBag(h, curr)
	last := curr.clone()
	for {
		curr, last = last, curr
		for _, s := range statements {
			if isBlank(s.Subject.Value) {
				var lab []byte
				if s.Label.Value != "" {
					lab = last.hashOf[s.Label.Value]
				}
				c := hashTuple(h, last.hashOf[s.Object.Value], last.hashOf[s.Predicate.Value], lab, []byte{'+'})
				bag.add(s.Subject.Value, c)
			}

			if isBlank(s.Object.Value) {
				var lab []byte
				if s.Label.Value != "" {
					lab = last.hashOf[s.Label.Value]
				}
				c := hashTuple(h, last.hashOf[s.Subject.Value], last.hashOf[s.Predicate.Value], lab, []byte{'-'})
				bag.add(s.Object.Value, c)
			}

			// This and the lab value above implement the label hashing
			// required for RDF dataset hashing as described in
			// https://doi.org/10.5281/zenodo.3154322 v1.0
			// Readme.md#adaptation-of-the-algorithms-to-handle-datasets.
			if isBlank(s.Label.Value) {
				c := hashTuple(h, last.hashOf[s.Subject.Value], last.hashOf[s.Predicate.Value], last.hashOf[s.Object.Value], []byte{'.'})
				bag.add(s.Label.Value, c)
			}
		}

		for t := range bag.hashesFor {
			curr.set(t, bag.sum(t))
		}

		disjoint = curr.allUnique()
		if disjoint || !curr.changedFrom(last) {
			return curr, disjoint
		}
	}
}

// table is a collision-aware hash collection for RDF terms.
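//
// For example, if set("_:a", h) and set("_:b", h) are called with the
// same hash h, the terms collide and both are retained:
//
//	t.termsFor[string(h)] // == map[string]bool{"_:a": true, "_:b": true}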
type table struct {
	// hashOf holds the hash for each term.
	hashOf map[string][]byte
	// termsFor holds, in the second key, the
	// set of terms that share the hash in the
	// first key.
	termsFor map[string]map[string]bool

	// isBlank and blanks are the set of
	// blank nodes.
	// isBlank is nil for cloned tables.
	isBlank map[string]bool
	// blanks is nil for tables created
	// with newTable.
	blanks []string
}

// newTable returns a new hash table.
func newTable() *table {
	return &table{
		hashOf:   make(map[string][]byte),
		termsFor: make(map[string]map[string]bool),
		isBlank:  make(map[string]bool),
	}
}

// wasCloned returns whether t is a parent or child of a cloning operation.
func (t *table) wasCloned() bool { return t.isBlank == nil }

// isNew returns whether t is a new table.
func (t *table) isNew() bool { return t.blanks == nil }

// clone returns a clone of the receiver.
func (t *table) clone() *table {
	new := &table{
		hashOf:   make(map[string][]byte),
		termsFor: make(map[string]map[string]bool),
	}
	for term, hash := range t.hashOf {
		new.hashOf[term] = hash
	}
	for hash, coll := range t.termsFor {
		if len(coll) == 0 {
			continue
		}
		terms := make(map[string]bool)
		for term := range coll {
			terms[term] = true
		}
		new.termsFor[hash] = terms
	}
	if t.isNew() {
		t.blanks = make([]string, len(t.isBlank))
		i := 0
		for n := range t.isBlank {
			t.blanks[i] = n
			i++
		}
		t.isBlank = nil
	}
	new.blanks = t.blanks
	return new
}

// TODO(kortschak): Make hash table in table.hashOf reuse the []byte on update.
// This is not trivial since we need to check for changes, so we can't just get
// the current hash buffer and write into it. So if this is done we probably
// need a pair of buffers, a current and a waiting.

// set sets the hash of the term, removing any previously set hash.
func (t *table) set(term string, hash []byte) {
	prev := t.hashOf[term]
	if bytes.Equal(prev, hash) {
		return
	}
	t.hashOf[term] = hash

	// Remove the term from the set held for its previous hash.
	switch terms := t.termsFor[string(prev)]; {
	case len(terms) == 1:
		delete(t.termsFor, string(prev))
	case len(terms) > 1:
		delete(terms, term)
	}

	terms, ok := t.termsFor[string(hash)]
	if ok {
		terms[term] = true
	} else {
		t.termsFor[string(hash)] = map[string]bool{term: true}
	}

	if !t.wasCloned() && isBlank(term) {
		// We are in the original table, so note
		// any blank node label that we see.
		t.isBlank[term] = true
	}
}

// allUnique returns whether every term has a unique hash. allUnique
// can only be called on a table that was returned by clone.
func (t *table) allUnique() bool {
	if t.isNew() {
		panic("checked hash bag from uncloned table")
	}
	for _, term := range t.blanks {
		if len(t.termsFor[string(t.hashOf[term])]) > 1 {
			return false
		}
	}
	return true
}

// changedFrom returns whether the receiver has been updated from last.
// changedFrom can only be called on a table that was returned by clone.
func (t *table) changedFrom(last *table) bool {
	if t.isNew() {
		panic("checked hash bag from uncloned table")
	}
	for i, x := range t.blanks {
		for _, y := range t.blanks[i+1:] {
			if bytes.Equal(t.hashOf[x], t.hashOf[y]) != bytes.Equal(last.hashOf[x], last.hashOf[y]) {
				return true
			}
		}
	}
	return false
}

// hashBag implements a commutative and associative hash.
// See notes in https://doi.org/10.5281/zenodo.3154322 v1.0
// Readme.md#what-is-the-precise-specification-of-hashbag.
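//
// Since sum sorts the collected hashes before combining them, the order
// in which hashes are added does not change the result (h1 and h2 are
// arbitrary hash values here):
//
//	b.add("_:x", h1)
//	b.add("_:x", h2)
//	s := b.sum("_:x") // Identical for either order of the add calls.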
type hashBag struct {
	hash      hash.Hash
	hashesFor map[string][][]byte
}

// newHashBag returns a new hashBag using the provided hash function for
// the given hash table. newHashBag can only take a table parameter that
// was returned by newTable.
func newHashBag(h hash.Hash, t *table) hashBag {
	if t.wasCloned() {
		panic("made hash bag from cloned table")
	}
	b := hashBag{hash: h, hashesFor: make(map[string][][]byte, len(t.isBlank))}
	for n := range t.isBlank {
		b.hashesFor[n] = [][]byte{t.hashOf[n]}
	}
	return b
}

// add adds the hash to the hash bag for the term.
func (b hashBag) add(term string, hash []byte) {
	b.hashesFor[term] = append(b.hashesFor[term], hash)
}

// sum calculates the hash sum for the given term, updates the hash bag
// state and returns the hash.
func (b hashBag) sum(term string) []byte {
	p := b.hashesFor[term]
	order.BySliceValues(p)
	h := hashTuple(b.hash, p...)
	b.hashesFor[term] = b.hashesFor[term][:1]
	b.hashesFor[term][0] = h
	return h
}

// hashTuple returns the h hash of the concatenation of t.
func hashTuple(h hash.Hash, t ...[]byte) []byte {
	h.Reset()
	for _, b := range t {
		h.Write(b)
	}
	return h.Sum(nil)
}

// hashBNodesPerSplit returns the independently hashed blank nodes of the
// graph described by statements using the provided hash function. Hashes
// are initialised with zero.
//
// This is algorithm 2 in doi:10.1145/3068333.
func hashBNodesPerSplit(statements []*Statement, decomp bool, h hash.Hash, zero []byte) (hash *table, parts byLengthHash, disjoint bool) {
	if !decomp {
		hash, ok := hashBNodes(statements, h, zero, nil)
		parts = appendOrdered(byLengthHash{}, hash.termsFor)
		sort.Sort(parts)
		return hash, parts, ok
	}

	splits := split(statements)

	// Avoid recombination work if there is only one split.
	if len(splits) == 1 {
		hash, ok := hashBNodes(statements, h, zero, nil)
		parts = appendOrdered(byLengthHash{}, hash.termsFor)
		sort.Sort(parts)
		return hash, parts, ok
	}

	hash = &table{hashOf: make(map[string][]byte)}
	disjoint = true
	for _, g := range splits {
		part, ok := hashBNodes(g, h, zero, nil)
		// Each split is guaranteed to be disjoint in its
		// set of blank nodes, so we can just append to our
		// collection of blanks.
		hash.blanks = append(hash.blanks, part.blanks...)
		if !ok {
			// Allow a short-circuit of the allUnique check.
			disjoint = false
		}
		for k, v := range part.hashOf {
			hash.hashOf[k] = v
		}
		parts = appendOrdered(parts, part.termsFor)
	}
	sort.Sort(parts)
	return hash, parts, disjoint && allUnique(hash.hashOf)
}

// appendOrdered adds the set of blank node labels (stored in the second
// key of partSets) for each hash (stored in the first key) to parts.
func appendOrdered(parts byLengthHash, partSets map[string]map[string]bool) byLengthHash {
	for h, s := range partSets {
		var p []string
		for e := range s {
			if isBlank(e) {
				p = append(p, e)
			}
		}
		if p != nil {
			parts.nodes = append(parts.nodes, p)
			parts.hashes = append(parts.hashes, h)
		}
	}
	return parts
}

// byLengthHash implements ascending length sort of a set of blank RDF
// term partitions with ties broken by lexical ordering of the partitions'
// hashes.
type byLengthHash struct {
	// nodes holds the blank nodes of a part.
	nodes [][]string
	// hashes holds the hashes corresponding
	// to the nodes in the nodes field, using
	// the same index.
	hashes []string
}

func (s byLengthHash) Len() int { return len(s.nodes) }
func (s byLengthHash) Less(i, j int) bool {
	switch {
	case len(s.nodes[i]) < len(s.nodes[j]):
		return true
	case len(s.nodes[i]) > len(s.nodes[j]):
		return false
	}
	return s.hashes[i] < s.hashes[j]
}
func (s byLengthHash) Swap(i, j int) {
	s.nodes[i], s.nodes[j] = s.nodes[j], s.nodes[i]
	s.hashes[i], s.hashes[j] = s.hashes[j], s.hashes[i]
}

// allUnique returns whether the []byte hash values in hashes are all unique.
func allUnique(hashes map[string][]byte) bool {
	set := make(map[string]bool)
	for _, h := range hashes {
		if set[string(h)] {
			return false
		}
		set[string(h)] = true
	}
	return true
}

// split returns the statements forming connected components in the graph
// described by statements.
//
// This is split in algorithm 2 in doi:10.1145/3068333.
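//
// For example, the dataset
//
//	_:a <p> _:b .
//	_:c <p> "l" .
//	<s> <p> <o> .
//
// is split into {_:a <p> _:b .} and {_:c <p> "l" .}, with the ground
// statement {<s> <p> <o> .} collected into a final split of its own.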
func split(statements []*Statement) [][]*Statement {
	ds := make(djSet)
	for _, s := range statements {
		ds.add(s.Subject.Value)
		ds.add(s.Object.Value)
		if isBlank(s.Subject.Value) && isBlank(s.Object.Value) {
			ds.union(ds.find(s.Subject.Value), ds.find(s.Object.Value))
		}
	}

	var (
		splits [][]*Statement
		ground []*Statement
	)
	idxOf := make(map[*dsNode]int)
	for _, s := range statements {
		var t string
		switch {
		case isBlank(s.Subject.Value):
			t = s.Subject.Value
		case isBlank(s.Object.Value):
			t = s.Object.Value
		default:
			ground = append(ground, s)
			continue
		}
		r := ds.find(t)
		if r == nil {
			panic(fmt.Sprintf("term not found: %q", t))
		}
		i, ok := idxOf[r]
		if !ok {
			i = len(splits)
			idxOf[r] = i
			splits = append(splits, []*Statement{s})
		} else {
			splits[i] = append(splits[i], s)
		}
	}
	if ground != nil {
		splits = append(splits, ground)
	}

	if debug {
		debug.log(0, "Splits:")
		for i, s := range splits {
			for j, t := range s {
				if j == 0 {
					debug.logf(0, "%d.\t%s\n", i+1, t)
				} else {
					debug.logf(0, "\t%s\n", t)
				}
			}
			debug.log(0)
		}
	}

	return splits
}

// distinguish returns G⊥, the smallest hash-labelled graph found thus far.
// The graph is returned as a node to hash lookup.
//
// This is part of algorithm 3 in doi:10.1145/3068333.
//
// The correspondence between the parameters for the function in the paper
// and the implementation here is as follows:
//  - G = statements
//  - hash = hash
//  - P = parts (already sorted by hashBNodesPerSplit)
//  - G⊥ = lowest
//  - B = hash.blanks
//
// The additional parameter dist specifies that distinguish should treat
// coequal trivial parts as a single coarse intermediate part and distinguish
// the nodes in that merged part.
func distinguish(statements []*Statement, dist bool, h hash.Hash, zero []byte, hash *table, parts byLengthHash, lowest map[string][]byte, depth int) map[string][]byte {
	if debug {
		debug.log(depth, "Running Distinguish")
	}

	var small []string
	var k int
	for k, small = range parts.nodes {
		if len(small) > 1 {
			break
		}
	}
	if len(small) < 2 {
		if lowest != nil || !dist {
			if debug {
				debug.log(depth, "Return lowest (no non-trivial parts):")
				debug.logHashes(depth, lowest, h.Size())
			}

			return lowest
		}

		// We have been given a set of fine parts,
		// but to reach here they must have been
		// non-uniquely labeled, so treat them
		// as a single coarse part.
		k, small = 0, parts.nodes[0]
	}

	if debug {
		debug.logf(depth, "Part: %v %x\n\n", small, parts.hashes[k])
		debug.log(depth, "Orig hash:")
		debug.logHashes(depth, hash.hashOf, h.Size())
	}

	smallHash := hash.hashOf[small[0]]
	for _, p := range parts.nodes[k:] {
		if !bytes.Equal(smallHash, hash.hashOf[p[0]]) {
			if debug {
				debug.logf(depth, "End of co-equal hashes: %x != %x\n\n", smallHash, hash.hashOf[p[0]])
			}

			break
		}
		for i, b := range p {
			if debug {
				debug.logf(depth, "Iter: %d — B = %q\n\n", i, b)

				if depth == 0 {
					debug.log(depth, "Current lowest:\n")
					debug.logHashes(depth, lowest, h.Size())
				}
			}

			hashP := hash.clone()
			hashP.set(b, hashTuple(h, hashP.hashOf[b], []byte{'@'}))
			hashPP, ok := hashBNodes(statements, h, zero, hashP.hashOf)
			if ok {
				if debug {
					debug.log(depth, "hashPP is trivial")
					debug.log(depth, "comparing hashPP\n")
					debug.logHashes(depth, hashPP.hashOf, h.Size())
					debug.log(depth, "with previous\n")
					debug.logHashes(depth, lowest, h.Size())
				}

				if lowest == nil || graphLess(statements, hashPP.hashOf, lowest) {
					lowest = hashPP.hashOf
					debug.log(depth, "choose hashPP\n")
				}
			} else {
				partsP := appendOrdered(byLengthHash{}, hashPP.termsFor)
				sort.Sort(partsP)

				if debug {
					debug.log(depth, "Parts':")
					debug.logParts(depth, partsP)
					debug.log(depth, "Recursive distinguish")
					debug.log(depth, "Called with current lowest:\n")
					debug.logHashes(depth, lowest, h.Size())
				}

				lowest = distinguish(statements, dist, h, zero, hashPP, partsP, lowest, depth+1)
			}
		}
	}

	if debug {
		debug.log(depth, "Return lowest:")
		debug.logHashes(depth, lowest, h.Size())
	}

	return lowest
}

// graphLess returns whether the graph of statements relabeled by a orders
// before the graph relabeled by b, with terms ordered syntactically,
// triples ordered lexicographically, and graphs ordered such that G < H
// if and only if G ⊂ H or there exists a triple t ∈ G \ H such that no
// triple t' ∈ H \ G exists where t' < t.
// See p9 of https://doi.org/10.1145/3068333.
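//
// For example, with triples ordered t1 < t2 < t3, G = {t1, t3} and
// H = {t2, t3}: G \ H = {t1} and H \ G = {t2}, and since no triple in
// H \ G is less than t1, G < H.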
func graphLess(statements []*Statement, a, b map[string][]byte) bool {
	g := newLexicalStatements(statements, a)
	sort.Sort(g)
	h := newLexicalStatements(statements, b)
	sort.Sort(h)

	gSubH := sub(g, h, len(g.statements))
	if len(gSubH) == 0 {
		return true
	}

	hSubG := sub(h, g, 1)
	if len(hSubG) == 0 {
		return true
	}
	lowestH := relabeledStatement{hSubG[0], h.hashes}

	for _, s := range gSubH {
		rs := relabeledStatement{s, g.hashes}
		if rs.less(lowestH) {
			return true
		}
	}
	return false
}

// lexicalStatements is a sort implementation for Statements with blank
// node labels replaced with their hash.
type lexicalStatements struct {
	statements []*Statement
	hashes     map[string][]byte
}

func newLexicalStatements(statements []*Statement, hash map[string][]byte) lexicalStatements {
	s := lexicalStatements{
		statements: make([]*Statement, len(statements)),
		hashes:     hash,
	}
	copy(s.statements, statements)
	return s
}

// sub returns the difference a \ b, up to max elements long.
func sub(a, b lexicalStatements, max int) []*Statement {
	var d []*Statement
	var i, j int
	for i < len(a.statements) && j < len(b.statements) && len(d) < max {
		ra := relabeledStatement{a.statements[i], a.hashes}
		rb := relabeledStatement{b.statements[j], b.hashes}
		switch {
		case ra.less(rb):
			d = append(d, a.statements[i])
			i++
		case rb.less(ra):
			j++
		default:
			i++
		}
	}
	if len(d) < max {
		d = append(d, a.statements[i:min(len(a.statements), i+max-len(d))]...)
	}
	return d
}

func (s lexicalStatements) Len() int { return len(s.statements) }
func (s lexicalStatements) Less(i, j int) bool {
	return relabeledStatement{s.statements[i], s.hashes}.less(relabeledStatement{s.statements[j], s.hashes})
}
func (s lexicalStatements) Swap(i, j int) {
	s.statements[i], s.statements[j] = s.statements[j], s.statements[i]
}

// relabeledStatement is a statement that is orderable by its blank node
// hash relabeling.
type relabeledStatement struct {
	statement *Statement
	labels    map[string][]byte
}

func (a relabeledStatement) less(b relabeledStatement) bool {
	switch {
	case relabeledTerm{a.statement.Subject, a.labels}.less(relabeledTerm{b.statement.Subject, b.labels}):
		return true
	case relabeledTerm{b.statement.Subject, b.labels}.less(relabeledTerm{a.statement.Subject, a.labels}):
		return false
	}
	switch { // Always IRI.
	case a.statement.Predicate.Value < b.statement.Predicate.Value:
		return true
	case a.statement.Predicate.Value > b.statement.Predicate.Value:
		return false
	}
	switch {
	case relabeledTerm{a.statement.Object, a.labels}.less(relabeledTerm{b.statement.Object, b.labels}):
		return true
	case relabeledTerm{b.statement.Object, b.labels}.less(relabeledTerm{a.statement.Object, a.labels}):
		return false
	}
	return relabeledTerm{a.statement.Label, a.labels}.less(relabeledTerm{b.statement.Label, b.labels})
}

func (s relabeledStatement) String() string {
	subj := relabeledTerm{term: s.statement.Subject, labels: s.labels}
	obj := relabeledTerm{term: s.statement.Object, labels: s.labels}
	if s.statement.Label.Value == "" {
		return fmt.Sprintf("%s %s %s .", subj, s.statement.Predicate.Value, obj)
	}
	lab := relabeledTerm{term: s.statement.Label, labels: s.labels}
	return fmt.Sprintf("%s %s %s %s .", subj, s.statement.Predicate.Value, obj, lab)
}

// relabeledTerm is a term that is orderable by its blank node hash relabeling.
type relabeledTerm struct {
	term   Term
	labels map[string][]byte
}

func (a relabeledTerm) less(b relabeledTerm) bool {
	aIsBlank := isBlank(a.term.Value)
	bIsBlank := isBlank(b.term.Value)
	switch {
	case aIsBlank && bIsBlank:
		return bytes.Compare(a.labels[a.term.Value], b.labels[b.term.Value]) < 0
	case aIsBlank:
		return blankPrefix < unquoteIRI(b.term.Value)
	case bIsBlank:
		return unquoteIRI(a.term.Value) < blankPrefix
	default:
		return unquoteIRI(a.term.Value) < unquoteIRI(b.term.Value)
	}
}

// unquoteIRI returns s with a single leading '<' and trailing '>' removed.
func unquoteIRI(s string) string {
	if len(s) > 1 && s[0] == '<' && s[len(s)-1] == '>' {
		s = s[1 : len(s)-1]
	}
	return s
}

func (t relabeledTerm) String() string {
	if !isBlank(t.term.Value) {
		return t.term.Value
	}
	h, ok := t.labels[t.term.Value]
	if !ok {
		return t.term.Value + "_missing_hash"
	}
	return fmt.Sprintf("_:%0x", h)
}