// Mirrored from https://github.com/gonum/gonum.git
// synced 2025-09-27 11:32:32 +08:00
// Copyright ©2020 The Gonum Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package rdf
|
|
|
|
import (
|
|
"bytes"
|
|
"crypto/sha1"
|
|
"crypto/sha256"
|
|
"errors"
|
|
"fmt"
|
|
"hash"
|
|
"slices"
|
|
|
|
"gonum.org/v1/gonum/stat/combin"
|
|
)
|
|
|
|
// Deduplicate removes duplicate statements in s, working in place, and returns
|
|
// the deduplicated slice with statements sorted in lexical order. Term UID
|
|
// fields are not considered and their values may be lost during deduplication.
|
|
func Deduplicate(s []*Statement) []*Statement {
|
|
if len(s) < 2 {
|
|
return s
|
|
}
|
|
sortC14nStatements(s)
|
|
curr := 0
|
|
for i, e := range s {
|
|
if isSameStatement(e, s[curr]) {
|
|
continue
|
|
}
|
|
curr++
|
|
if curr < i {
|
|
s[curr], s[i] = s[i], nil
|
|
}
|
|
}
|
|
return s[:curr+1]
|
|
}
|
|
|
|
func isSameStatement(a, b *Statement) bool {
|
|
if a == b {
|
|
return true
|
|
}
|
|
return a.Subject.Value == b.Subject.Value &&
|
|
a.Predicate.Value == b.Predicate.Value &&
|
|
a.Object.Value == b.Object.Value &&
|
|
a.Label.Value == b.Label.Value
|
|
}
|
|
|
|
// Note on implementation details: The comment numbering in the code relates the
|
|
// implementation to the steps of the algorithm described in the specification.
|
|
|
|
// URGNA2012 applies the Universal RDF Graph Normalization Algorithm 2012
|
|
// to the statements in src, placing the result in dst and returning it.
|
|
// If dst is nil a slice of statements will be allocated. If dst is not
|
|
// nil and not the same length as src, URGNA2012 will return an error.
|
|
//
|
|
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html for details.
|
|
func URGNA2012(dst, src []*Statement) ([]*Statement, error) {
|
|
if dst == nil {
|
|
dst = make([]*Statement, len(src))
|
|
} else if len(dst) != len(src) {
|
|
return dst, errors.New("rdf: slice length mismatch")
|
|
}
|
|
// 1. https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm
|
|
u := &urna{
|
|
canon: newIssuer("_:c14n"),
|
|
hashes: make(map[string]string),
|
|
statementsFor: make(map[string][]*Statement),
|
|
hash: sha1.New(),
|
|
label: "_:g",
|
|
}
|
|
u.hashToRelated = u.hashToRelatedURGNA2012
|
|
return u.relabel(dst, src)
|
|
}
|
|
|
|
// URDNA2015 applies the Universal RDF Dataset Normalization Algorithm 2015
|
|
// to the statements in src, placing the result in dst and returning it.
|
|
// If dst is nil a slice of statements will be allocated. If dst is not
|
|
// nil and not the same length as src, URDNA2015 will return an error.
|
|
//
|
|
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html for details.
|
|
func URDNA2015(dst, src []*Statement) ([]*Statement, error) {
|
|
if dst == nil {
|
|
dst = make([]*Statement, len(src))
|
|
} else if len(dst) != len(src) {
|
|
return dst, errors.New("rdf: slice length mismatch")
|
|
}
|
|
// 1. https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm
|
|
u := &urna{
|
|
canon: newIssuer("_:c14n"),
|
|
hashes: make(map[string]string),
|
|
statementsFor: make(map[string][]*Statement),
|
|
hash: sha256.New(),
|
|
}
|
|
u.hashToRelated = u.hashToRelatedURDNA2015
|
|
return u.relabel(dst, src)
|
|
}
|
|
|
|
// urna is the canonicalization state for the URGNA2012 and URDNA2015
// algorithms. The urna type implements both algorithms through the state
// of the label and hashToRelated fields.
//
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#canonicalization-state
// for details.
type urna struct {
	// canon is the canonical issuer.
	canon *issuer

	// hashes holds already calculated hashes
	// for hashing first degree quads. It memoizes
	// hashFirstDegreeQuads, keyed by blank node
	// identifier.
	hashes map[string]string

	// statementsFor is the blank node to quads map:
	// each blank node identifier maps to the statements
	// that mention it.
	statementsFor map[string][]*Statement

	// hash is the hash function used by the
	// canonicalization function: SHA-1 for URGNA2012
	// and SHA-256 for URDNA2015.
	hash hash.Hash

	// hashToRelated holds URGNA2012 and URDNA2015-
	// specific hashing routines.
	hashToRelated relatedHashCreator

	// label holds "_:g" when running URGNA2012.
	// Otherwise it is empty.
	label string
}
|
|
|
|
// relabel is the algorithm described in section 4.4.2 of the spec at
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm.
// It rewrites the blank node labels of src into canonical form, placing
// the relabeled statements into dst and returning it sorted lexically.
func (u *urna) relabel(dst, src []*Statement) ([]*Statement, error) {
	// termsFor is the hash to blank nodes map.
	// It is not held in the urna struct, but is
	// part of the canonicalization state.
	//
	// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#dfn-hash-to-blank-nodes-map
	var termsFor map[string][]string // 1.

	// 2. Populate the blank node to quads map: record each statement
	// once against every blank node term it mentions.
	for _, s := range src {
	terms:
		for _, t := range []string{
			s.Subject.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			if !isBlank(t) {
				continue
			}
			// Skip s if it is already recorded against t, for
			// example when t occupies more than one position.
			for _, e := range u.statementsFor[t] {
				if e == s {
					continue terms
				}
			}
			u.statementsFor[t] = append(u.statementsFor[t], s)
		}
	}

	// todo is the list of non-normalized blank node identifiers.
	todo := make(map[string]bool) // 3.
	for b := range u.statementsFor {
		todo[b] = true
	}

	// 5. Repeatedly issue canonical identifiers to blank nodes whose
	// first degree hash is unique, until no node can be retired.
	simple := true // 4.
	for simple {
		simple = false // 5.1

		termsFor = make(map[string][]string) // 5.2

		// 5.3 Hash the first degree quads of each remaining node.
		for b := range todo {
			hash := u.hashFirstDegreeQuads(b)          // 5.3.1
			termsFor[hash] = append(termsFor[hash], b) // 5.3.2
		}

		// 5.4 Retire nodes with a unique hash, in lexical hash order.
		for _, h := range lexicallySortedTermHashes(termsFor) {
			terms := termsFor[h]
			if len(terms) > 1 { // 5.4.1
				continue
			}
			u.canon.issueFor(terms[0]) // 5.4.2
			delete(todo, terms[0])     // 5.4.3
			delete(termsFor, h)        // 5.4.4
			simple = true              // 5.4.5
		}
	}

	// 6. Nodes remaining in termsFor share first degree hashes;
	// disambiguate them via the N-degree hash of each node.
	for _, hash := range lexicallySortedTermHashes(termsFor) {
		paths := make(map[string][]*issuer) // 6.1
		for _, b := range termsFor[hash] {  // 6.2
			if u.canon.has(b) { // 6.2.1
				continue
			}
			names := newIssuer("_:b") // 6.2.2
			names.issueFor(b)         // 6.2.3

			// 6.2.4
			hash, issuer := u.hashNDegreeQuads(b, names)
			paths[string(hash)] = append(paths[string(hash)], issuer)
		}

		// 6.3 Issue canonical identifiers in each path's temporary
		// issue order, processing paths in lexical hash order.
		for _, hash := range lexicallySortedPathHashes(paths) {
			for _, i := range paths[hash] {
				for _, existing := range i.ordered { // 6.3.1
					u.canon.issueFor(existing)
				}
			}
		}
	}

	// 7. Emit the relabeled statements, translating blank node labels
	// through the canonical issuer and preserving Term UIDs.
	for i, s := range src {
		if dst[i] == nil {
			dst[i] = &Statement{}
		}
		n := dst[i]
		n.Subject = Term{Value: translateURNA(s.Subject.Value, u.canon.issued), UID: s.Subject.UID}
		n.Predicate = s.Predicate
		n.Object = Term{Value: translateURNA(s.Object.Value, u.canon.issued), UID: s.Object.UID}
		n.Label = Term{Value: translateURNA(s.Label.Value, u.canon.issued), UID: s.Label.UID}
	}
	sortC14nStatements(dst)

	return dst, nil
}
|
|
|
|
// lexicallySortedPathHashes returns the lexically sorted hashes of paths.
|
|
func lexicallySortedPathHashes(paths map[string][]*issuer) []string {
|
|
lexicalHashPaths := make([]string, len(paths))
|
|
i := 0
|
|
for h := range paths {
|
|
lexicalHashPaths[i] = h
|
|
i++
|
|
}
|
|
slices.Sort(lexicalHashPaths)
|
|
return lexicalHashPaths
|
|
}
|
|
|
|
// translateURNA maps term through the canonical label mapping and returns
// the normalized serialization: blank node labels are returned as mapped,
// while IRIs and literals are re-parsed and rebuilt with NewIRITerm and
// NewLiteralTerm so the returned Value matches the canonical form.
func translateURNA(term string, mapping map[string]string) string {
	term = translate(term, mapping)
	if term == "" {
		return ""
	}
	text, qual, kind, err := extract([]rune(term))
	var t Term
	switch kind {
	case Blank:
		// Blank labels were already rewritten by translate.
		return term
	case IRI:
		t, err = NewIRITerm(text)
	case Literal:
		t, err = NewLiteralTerm(text, qual)
	}
	// NOTE(review): terms are presumably produced by the package's own
	// parser and thus well formed, so a failure here is treated as an
	// internal invariant violation and panics rather than returning an
	// error — confirm against callers.
	if err != nil {
		panic(fmt.Errorf("rdf: invalid term %q: %w", term, err))
	}
	return t.Value
}
|
|
|
|
// hashFirstDegreeQuads is the algorithm described in section 4.6 of the spec
|
|
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm-1.
|
|
func (u *urna) hashFirstDegreeQuads(b string) string {
|
|
if h, ok := u.hashes[b]; ok {
|
|
return h
|
|
}
|
|
|
|
var statements []*Statement // 1.
|
|
|
|
for _, s := range u.statementsFor[b] { // 2. and 3.
|
|
var n Statement
|
|
n.Subject.Value = replaceBlank(s.Subject.Value, b, "")
|
|
n.Predicate.Value = s.Predicate.Value
|
|
n.Object.Value = replaceBlank(s.Object.Value, b, "")
|
|
n.Label.Value = replaceBlank(s.Label.Value, b, u.label)
|
|
statements = append(statements, &n)
|
|
}
|
|
|
|
sortC14nStatements(statements) // 4.
|
|
|
|
// 5.
|
|
u.hash.Reset()
|
|
for _, s := range statements {
|
|
fmt.Fprintln(u.hash, s)
|
|
}
|
|
u.hashes[b] = string(hex(u.hash.Sum(nil)))
|
|
|
|
return u.hashes[b]
|
|
}
|
|
|
|
// replaceBlank implements 3.1 of the algorithm described at
|
|
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm-1.
|
|
func replaceBlank(b, matching, label string) string {
|
|
if !isBlank(b) { // 3.1
|
|
return b
|
|
}
|
|
if label != "" { // URGNA2012 modification.
|
|
// When running in URGNA2012 mode, label is "_:g" for Label fields.
|
|
//
|
|
// If any blank node was used in the graph name position in the quad,
|
|
// then the value was serialized using the special blank node identifier,
|
|
// "_:g", instead of "_:z".
|
|
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012
|
|
return label
|
|
}
|
|
// 3.1.1.1
|
|
if b == matching {
|
|
return "_:a"
|
|
}
|
|
return "_:z"
|
|
}
|
|
|
|
// hashNDegreeQuads is the algorithm described in section 4.8 of the spec
|
|
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads.
|
|
func (u *urna) hashNDegreeQuads(b string, names *issuer) ([]byte, *issuer) {
|
|
// termsFor is the hash to related blank nodes map.
|
|
termsFor := u.hashToRelated(b, names) // 1., 2. and 3.
|
|
var final []byte // 4.
|
|
|
|
for _, hash := range lexicallySortedTermHashes(termsFor) { // 5.
|
|
terms := termsFor[hash]
|
|
final = append(final, hash...) // 5.1
|
|
var chosenPath []byte // 5.2
|
|
var chosenIssuer *issuer // 5.3
|
|
p := newPermutations(terms) // 5.4
|
|
permutations:
|
|
for p.next() {
|
|
namesCopy := names.clone() // 5.4.1
|
|
var path []byte // 5.4.2
|
|
var work []string // 5.4.3
|
|
for _, b := range p.permutation() { // 5.4.4
|
|
if u.canon.has(b) { // 5.4.4.1
|
|
path = append(path, u.canon.issueFor(b)...)
|
|
} else { // 5.4.4.1
|
|
if !namesCopy.has(b) {
|
|
work = append(work, b)
|
|
}
|
|
|
|
path = append(path, namesCopy.issueFor(b)...) // 5.4.4.2.2
|
|
}
|
|
|
|
// 5.4.4.3
|
|
if len(chosenPath) != 0 && len(path) >= len(chosenPath) && bytes.Compare(path, chosenPath) > 0 {
|
|
continue permutations
|
|
}
|
|
}
|
|
|
|
for _, b := range work { // 5.4.5
|
|
hash, issuer := u.hashNDegreeQuads(b, namesCopy) // 5.4.5.1
|
|
path = append(path, namesCopy.issueFor(b)...) // 5.4.5.2
|
|
|
|
// 5.4.5.3
|
|
path = append(path, '<')
|
|
path = append(path, hash...)
|
|
path = append(path, '>')
|
|
|
|
namesCopy = issuer // 5.4.5.4
|
|
|
|
// 5.4.5.5
|
|
if len(chosenPath) != 0 && len(path) >= len(chosenPath) && bytes.Compare(path, chosenPath) > 0 {
|
|
continue permutations
|
|
}
|
|
}
|
|
|
|
if len(chosenPath) == 0 || bytes.Compare(path, chosenPath) < 0 { // 5.4.6
|
|
chosenPath = path
|
|
chosenIssuer = namesCopy
|
|
}
|
|
|
|
}
|
|
// 5.5
|
|
final = append(final, chosenPath...)
|
|
u.hash.Reset()
|
|
u.hash.Write(final)
|
|
|
|
names = chosenIssuer // 5.6
|
|
}
|
|
|
|
return hex(u.hash.Sum(nil)), names
|
|
}
|
|
|
|
// lexicallySortedTermHashes returns the lexically sorted hashes of termsFor.
func lexicallySortedTermHashes(termsFor map[string][]string) []string {
	hashes := make([]string, 0, len(termsFor))
	for h := range termsFor {
		hashes = append(hashes, h)
	}
	slices.Sort(hashes)
	return hashes
}
|
|
|
|
// relatedHashCreator is the signature shared by the URGNA2012- and
// URDNA2015-specific related blank node hashing routines used as steps
// 1–3 of the Hash N-Degree Quads algorithm.
type relatedHashCreator func(b string, names *issuer) map[string][]string
|
|
|
|
// hashToRelatedURDNA2015 is the section 1. 2. and 3. of 4.8.2 of the spec
|
|
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads.
|
|
func (u *urna) hashToRelatedURDNA2015(b string, names *issuer) map[string][]string {
|
|
// termsFor is the hash to related blank nodes map.
|
|
termsFor := make(map[string][]string) // 1.
|
|
|
|
for _, s := range u.statementsFor[b] { // 2. and 3.
|
|
for i, term := range []string{ // 3.1
|
|
s.Subject.Value,
|
|
s.Object.Value,
|
|
s.Label.Value,
|
|
} {
|
|
if !isBlank(term) || term == b {
|
|
continue
|
|
}
|
|
|
|
// 3.1.1
|
|
const position = "sog"
|
|
hash := u.hashRelatedBlank(term, s, names, position[i])
|
|
|
|
// 3.1.2
|
|
termsFor[string(hash)] = append(termsFor[string(hash)], term)
|
|
}
|
|
}
|
|
|
|
return termsFor
|
|
}
|
|
|
|
// hashToRelatedURGNA2012 is the section 1., 2. and 3. of 4.8.2 of the spec
|
|
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads
|
|
// with changes made for URGNA2012 shown in the appendix for 4.8 at
|
|
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012.
|
|
// The numbering of steps here corresponds to the spec's numbering in the
|
|
// appendix.
|
|
func (u *urna) hashToRelatedURGNA2012(b string, names *issuer) map[string][]string {
|
|
// termsFor is the hash to related blank nodes map.
|
|
termsFor := make(map[string][]string)
|
|
|
|
for _, s := range u.statementsFor[b] { // 1.
|
|
var (
|
|
term string
|
|
pos byte
|
|
)
|
|
switch {
|
|
case isBlank(s.Subject.Value) && s.Subject.Value != b: // 1.1
|
|
term = s.Subject.Value
|
|
pos = 'p'
|
|
case isBlank(s.Object.Value) && s.Object.Value != b: // 1.2
|
|
term = s.Object.Value
|
|
pos = 'r'
|
|
default:
|
|
continue // 1.3
|
|
}
|
|
|
|
// 1.4
|
|
hash := u.hashRelatedBlank(term, s, names, pos)
|
|
termsFor[string(hash)] = append(termsFor[string(hash)], term)
|
|
}
|
|
|
|
return termsFor
|
|
}
|
|
|
|
// hashRelatedBlank is the Hash Related Blank Node algorithm described in
// section 4.7 of the spec at
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-related-blank-node.
// It returns the hex-encoded hash of term as it relates to the statement s,
// where pos identifies term's position in the quad.
func (u *urna) hashRelatedBlank(term string, s *Statement, names *issuer, pos byte) []byte {
	// 1. Choose the identifier for term: the canonical one if issued,
	// otherwise a temporary one, otherwise its first degree hash.
	var b string
	switch {
	case u.canon.has(term):
		b = u.canon.issueFor(term)
	case names.has(term):
		b = names.issueFor(term)
	default:
		b = u.hashFirstDegreeQuads(term)
	}

	// 2. Seed the hash with the position character.
	u.hash.Reset()
	u.hash.Write([]byte{pos})

	if pos != 'g' { // 3.
		if u.label == "" {
			// URDNA2015: Term.Value retained the angle quotes
			// so we don't need to add them.
			u.hash.Write([]byte(s.Predicate.Value))
		} else {
			// URGNA2012 does not delimit predicate by < and >.
			// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012
			// with reference to 4.7.
			u.hash.Write([]byte(unquoteIRI(s.Predicate.Value)))
		}
	}

	// 4. and 5.
	u.hash.Write([]byte(b))
	return hex(u.hash.Sum(nil))
}
|
|
|
|
// issuer is an identifier issuer.
type issuer struct {
	// prefix is prepended to the sequence number of each
	// issued identifier, for example "_:c14n" or "_:b".
	prefix string
	// issued maps blank node identifiers to the
	// identifiers issued for them.
	issued map[string]string
	// ordered records the keys of issued in the order
	// in which they were first issued.
	ordered []string
}
|
|
|
|
// newIssuer returns a new identifier issuer with the given prefix.
|
|
func newIssuer(prefix string) *issuer {
|
|
return &issuer{prefix: prefix, issued: make(map[string]string)}
|
|
}
|
|
|
|
// issueFor implements the issue identifier algorithm.
|
|
//
|
|
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#issue-identifier-algorithm
|
|
func (i *issuer) issueFor(b string) string {
|
|
c, ok := i.issued[b]
|
|
if ok {
|
|
return c
|
|
}
|
|
c = fmt.Sprintf("%s%d", i.prefix, len(i.issued))
|
|
i.issued[b] = c
|
|
i.ordered = append(i.ordered, b)
|
|
return c
|
|
}
|
|
|
|
func (i *issuer) has(id string) bool {
|
|
_, ok := i.issued[id]
|
|
return ok
|
|
}
|
|
|
|
func (i *issuer) clone() *issuer {
|
|
new := issuer{
|
|
prefix: i.prefix,
|
|
issued: make(map[string]string, len(i.issued)),
|
|
ordered: make([]string, len(i.ordered)),
|
|
}
|
|
copy(new.ordered, i.ordered)
|
|
for k, v := range i.issued {
|
|
new.issued[k] = v
|
|
}
|
|
return &new
|
|
}
|
|
|
|
// hex returns the lowercase hexadecimal encoding of data.
func hex(data []byte) []byte {
	const digit = "0123456789abcdef"
	buf := make([]byte, 2*len(data))
	for i, b := range data {
		buf[2*i] = digit[b>>4]
		buf[2*i+1] = digit[b&0xf]
	}
	return buf
}
|
|
|
|
// permutations is a string permutation generator.
type permutations struct {
	// src holds the strings being permuted.
	src []string
	// dst is the reusable buffer into which each
	// permutation of src is written.
	dst []string
	// idx is the reusable buffer filled with index
	// permutations by perm.
	idx []int
	// perm generates the index permutations.
	perm *combin.PermutationGenerator
}
|
|
|
|
// newPermutations returns a new permutations over the elements of src.
func newPermutations(src []string) *permutations {
	return &permutations{
		src: src,
		dst: make([]string, len(src)),
		perm: combin.NewPermutationGenerator(len(src), len(src)),
		idx: make([]int, len(src)),
	}
}
|
|
|
|
// next returns whether there is another permutation available,
// advancing the underlying generator.
func (p *permutations) next() bool {
	return p.perm.Next()
}
|
|
|
|
// permutation returns the permutation. The caller may not retain the
|
|
// returned slice between iterations.
|
|
func (p *permutations) permutation() []string {
|
|
p.perm.Permutation(p.idx)
|
|
for i, j := range p.idx {
|
|
p.dst[j] = p.src[i]
|
|
}
|
|
return p.dst
|
|
}
|