// Copyright ©2020 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package rdf

import (
	"bytes"
	"crypto/sha1"
	"crypto/sha256"
	"errors"
	"fmt"
	"hash"
	"slices"

	"gonum.org/v1/gonum/stat/combin"
)

// Deduplicate removes duplicate statements in s, working in place, and returns
// the deduplicated slice with statements sorted in lexical order. Term UID
// fields are not considered and their values may be lost during deduplication.
func Deduplicate(s []*Statement) []*Statement {
	if len(s) < 2 {
		return s
	}
	sortC14nStatements(s)
	curr := 0
	for i, e := range s {
		if isSameStatement(e, s[curr]) {
			continue
		}
		curr++
		if curr < i {
			s[curr], s[i] = s[i], nil
		}
	}
	return s[:curr+1]
}

func isSameStatement(a, b *Statement) bool {
	if a == b {
		return true
	}
	return a.Subject.Value == b.Subject.Value &&
		a.Predicate.Value == b.Predicate.Value &&
		a.Object.Value == b.Object.Value &&
		a.Label.Value == b.Label.Value
}

// Note on implementation details: The comment numbering in the code relates
// the implementation to the steps of the algorithm described in the
// specification.

// URGNA2012 applies the Universal RDF Graph Normalization Algorithm 2012
// to the statements in src, placing the result in dst and returning it.
// If dst is nil a slice of statements will be allocated. If dst is not
// nil and not the same length as src, URGNA2012 will return an error.
//
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html for details.
func URGNA2012(dst, src []*Statement) ([]*Statement, error) {
	if dst == nil {
		dst = make([]*Statement, len(src))
	} else if len(dst) != len(src) {
		return dst, errors.New("rdf: slice length mismatch")
	}
	// 1. https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm
	u := &urna{
		canon:         newIssuer("_:c14n"),
		hashes:        make(map[string]string),
		statementsFor: make(map[string][]*Statement),
		hash:          sha1.New(),
		label:         "_:g",
	}
	u.hashToRelated = u.hashToRelatedURGNA2012
	return u.relabel(dst, src)
}

// URDNA2015 applies the Universal RDF Dataset Normalization Algorithm 2015
// to the statements in src, placing the result in dst and returning it.
// If dst is nil a slice of statements will be allocated. If dst is not
// nil and not the same length as src, URDNA2015 will return an error.
//
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html for details.
func URDNA2015(dst, src []*Statement) ([]*Statement, error) {
	if dst == nil {
		dst = make([]*Statement, len(src))
	} else if len(dst) != len(src) {
		return dst, errors.New("rdf: slice length mismatch")
	}
	// 1. https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm
	u := &urna{
		canon:         newIssuer("_:c14n"),
		hashes:        make(map[string]string),
		statementsFor: make(map[string][]*Statement),
		hash:          sha256.New(),
	}
	u.hashToRelated = u.hashToRelatedURDNA2015
	return u.relabel(dst, src)
}
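// A minimal usage sketch for the API above (the statements variable and its
// contents are illustrative; in practice it would be filled by a decoder or
// constructed directly):
//
//	statements = Deduplicate(statements)
//	canon, err := URDNA2015(nil, statements)
//	if err != nil {
//		// Handle the error.
//	}
//	// canon now holds the canonically labeled, lexically sorted statements.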
// urna is the canonicalization state for the URGNA2012 and URDNA2015
// algorithms. The urna type implements both algorithms through the state
// of the label and hashToRelated fields.
//
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#canonicalization-state
// for details.
type urna struct {
	// canon is the canonical issuer.
	canon *issuer

	// hashes holds already calculated hashes
	// for hashing first degree quads.
	hashes map[string]string

	// statementsFor is the blank node to quads map.
	statementsFor map[string][]*Statement

	// hash is the hash function used by the
	// canonicalization function.
	hash hash.Hash

	// hashToRelated holds the URGNA2012- or
	// URDNA2015-specific related hashing routine.
	hashToRelated relatedHashCreator

	// label holds "_:g" when running URGNA2012.
	// Otherwise it is empty.
	label string
}

// relabel is the algorithm described in section 4.4.2 of the spec at
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm.
func (u *urna) relabel(dst, src []*Statement) ([]*Statement, error) {
	// termsFor is the hash to blank nodes map.
	// It is not held in the urna struct, but is
	// part of the canonicalization state.
	//
	// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#dfn-hash-to-blank-nodes-map
	var termsFor map[string][]string

	// 1.
	for _, s := range src {
		// 2.
	terms:
		for _, t := range []string{
			s.Subject.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			if !isBlank(t) {
				continue
			}
			for _, e := range u.statementsFor[t] {
				if e == s {
					continue terms
				}
			}
			u.statementsFor[t] = append(u.statementsFor[t], s)
		}
	}

	// todo is the set of non-normalized blank node identifiers.
	todo := make(map[string]bool)
	// 3.
	for b := range u.statementsFor {
		todo[b] = true
	}

	simple := true // 4.
	for simple {   // 5.
		simple = false                       // 5.1
		termsFor = make(map[string][]string) // 5.2
		for b := range todo {                // 5.3
			hash := u.hashFirstDegreeQuads(b)          // 5.3.1
			termsFor[hash] = append(termsFor[hash], b) // 5.3.2
		}
		for _, h := range lexicallySortedTermHashes(termsFor) { // 5.4
			terms := termsFor[h]
			if len(terms) > 1 { // 5.4.1
				continue
			}
			u.canon.issueFor(terms[0]) // 5.4.2
			delete(todo, terms[0])     // 5.4.3
			delete(termsFor, h)        // 5.4.4
			simple = true              // 5.4.5
		}
	}

	for _, hash := range lexicallySortedTermHashes(termsFor) { // 6.
		paths := make(map[string][]*issuer) // 6.1
		for _, b := range termsFor[hash] {  // 6.2
			if u.canon.has(b) { // 6.2.1
				continue
			}
			names := newIssuer("_:b") // 6.2.2
			names.issueFor(b)         // 6.2.3
			// 6.2.4
			hash, issuer := u.hashNDegreeQuads(b, names)
			paths[string(hash)] = append(paths[string(hash)], issuer)
		}
		for _, hash := range lexicallySortedPathHashes(paths) { // 6.3
			for _, i := range paths[hash] {
				for _, existing := range i.ordered { // 6.3.1
					u.canon.issueFor(existing)
				}
			}
		}
	}

	// 7.
	for i, s := range src {
		if dst[i] == nil {
			dst[i] = &Statement{}
		}
		n := dst[i]
		n.Subject = Term{Value: translateURNA(s.Subject.Value, u.canon.issued), UID: s.Subject.UID}
		n.Predicate = s.Predicate
		n.Object = Term{Value: translateURNA(s.Object.Value, u.canon.issued), UID: s.Object.UID}
		n.Label = Term{Value: translateURNA(s.Label.Value, u.canon.issued), UID: s.Label.UID}
	}
	sortC14nStatements(dst)

	return dst, nil
}
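// As a sketch of the overall effect (which input label receives which
// canonical identifier depends on the computed hashes), relabeling the quads
//
//	_:x <http://example.com/p> _:y .
//	_:y <http://example.com/q> "a" .
//
// replaces _:x and _:y with the identifiers _:c14n0 and _:c14n1 issued by
// the canonical issuer.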
// lexicallySortedPathHashes returns the lexically sorted hashes of paths.
func lexicallySortedPathHashes(paths map[string][]*issuer) []string {
	lexicalHashPaths := make([]string, len(paths))
	i := 0
	for h := range paths {
		lexicalHashPaths[i] = h
		i++
	}
	slices.Sort(lexicalHashPaths)
	return lexicalHashPaths
}

func translateURNA(term string, mapping map[string]string) string {
	term = translate(term, mapping)
	if term == "" {
		return ""
	}
	text, qual, kind, err := extract([]rune(term))
	var t Term
	switch kind {
	case Blank:
		return term
	case IRI:
		t, err = NewIRITerm(text)
	case Literal:
		t, err = NewLiteralTerm(text, qual)
	}
	if err != nil {
		panic(fmt.Errorf("rdf: invalid term %q: %w", term, err))
	}
	return t.Value
}

// hashFirstDegreeQuads is the algorithm described in section 4.6 of the spec
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm-1.
func (u *urna) hashFirstDegreeQuads(b string) string {
	if h, ok := u.hashes[b]; ok {
		return h
	}

	var statements []*Statement
	// 1.
	for _, s := range u.statementsFor[b] {
		// 2. and 3.
		var n Statement
		n.Subject.Value = replaceBlank(s.Subject.Value, b, "")
		n.Predicate.Value = s.Predicate.Value
		n.Object.Value = replaceBlank(s.Object.Value, b, "")
		n.Label.Value = replaceBlank(s.Label.Value, b, u.label)
		statements = append(statements, &n)
	}

	sortC14nStatements(statements) // 4.

	// 5.
	u.hash.Reset()
	for _, s := range statements {
		fmt.Fprintln(u.hash, s)
	}
	u.hashes[b] = string(hex(u.hash.Sum(nil)))

	return u.hashes[b]
}

// replaceBlank implements 3.1 of the algorithm described at
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#algorithm-1.
func replaceBlank(b, matching, label string) string {
	if !isBlank(b) { // 3.1
		return b
	}
	if label != "" {
		// URGNA2012 modification.
		// When running in URGNA2012 mode, label is "_:g" for Label fields.
		//
		// If any blank node was used in the graph name position in the quad,
		// then the value was serialized using the special blank node identifier,
		// "_:g", instead of "_:z".
		// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012
		return label
	}
	// 3.1.1.1
	if b == matching {
		return "_:a"
	}
	return "_:z"
}
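// For example (a sketch; the serialization shown is abbreviated), when
// hashing the first degree quads of _:b0 in
//
//	_:b0 <http://example.com/p> _:b1 .
//
// the quad is serialized with _:b0 replaced by _:a and _:b1 by _:z before
// hashing, so the hash depends only on the node's position within its
// quads, not on the arbitrary labels in the input.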
// hashNDegreeQuads is the algorithm described in section 4.8 of the spec
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads.
func (u *urna) hashNDegreeQuads(b string, names *issuer) ([]byte, *issuer) {
	// termsFor is the hash to related blank nodes map.
	termsFor := u.hashToRelated(b, names) // 1., 2. and 3.

	var final []byte // 4.

	for _, hash := range lexicallySortedTermHashes(termsFor) { // 5.
		terms := termsFor[hash]
		final = append(final, hash...) // 5.1

		var chosenPath []byte    // 5.2
		var chosenIssuer *issuer // 5.3

		p := newPermutations(terms) // 5.4
	permutations:
		for p.next() {
			namesCopy := names.clone() // 5.4.1
			var path []byte            // 5.4.2
			var work []string          // 5.4.3

			for _, b := range p.permutation() { // 5.4.4
				if u.canon.has(b) { // 5.4.4.1
					path = append(path, u.canon.issueFor(b)...)
				} else { // 5.4.4.2
					if !namesCopy.has(b) { // 5.4.4.2.1
						work = append(work, b)
					}
					path = append(path, namesCopy.issueFor(b)...) // 5.4.4.2.2
				}
				// 5.4.4.3
				if len(chosenPath) != 0 && len(path) >= len(chosenPath) && bytes.Compare(path, chosenPath) > 0 {
					continue permutations
				}
			}

			for _, b := range work { // 5.4.5
				hash, issuer := u.hashNDegreeQuads(b, namesCopy) // 5.4.5.1
				path = append(path, namesCopy.issueFor(b)...)    // 5.4.5.2
				// 5.4.5.3
				path = append(path, '<')
				path = append(path, hash...)
				path = append(path, '>')
				namesCopy = issuer // 5.4.5.4
				// 5.4.5.5
				if len(chosenPath) != 0 && len(path) >= len(chosenPath) && bytes.Compare(path, chosenPath) > 0 {
					continue permutations
				}
			}

			if len(chosenPath) == 0 || bytes.Compare(path, chosenPath) < 0 { // 5.4.6
				chosenPath = path
				chosenIssuer = namesCopy
			}
		}

		// 5.5
		final = append(final, chosenPath...)
		u.hash.Reset()
		u.hash.Write(final)

		names = chosenIssuer // 5.6
	}

	return hex(u.hash.Sum(nil)), names
}

// lexicallySortedTermHashes returns the lexically sorted hashes of termsFor.
func lexicallySortedTermHashes(termsFor map[string][]string) []string {
	lexicalHashes := make([]string, len(termsFor))
	i := 0
	for h := range termsFor {
		lexicalHashes[i] = h
		i++
	}
	slices.Sort(lexicalHashes)
	return lexicalHashes
}

type relatedHashCreator func(b string, names *issuer) map[string][]string

// hashToRelatedURDNA2015 implements steps 1., 2. and 3. of section 4.8.2 of the spec
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads.
func (u *urna) hashToRelatedURDNA2015(b string, names *issuer) map[string][]string {
	// termsFor is the hash to related blank nodes map.
	termsFor := make(map[string][]string)
	// 1.
	for _, s := range u.statementsFor[b] {
		// 2. and 3.
		for i, term := range []string{ // 3.1
			s.Subject.Value,
			s.Object.Value,
			s.Label.Value,
		} {
			if !isBlank(term) || term == b {
				continue
			}
			// 3.1.1
			const position = "sog"
			hash := u.hashRelatedBlank(term, s, names, position[i])
			// 3.1.2
			termsFor[string(hash)] = append(termsFor[string(hash)], term)
		}
	}
	return termsFor
}

// hashToRelatedURGNA2012 implements steps 1., 2. and 3. of section 4.8.2 of the spec
// at https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-n-degree-quads
// with changes made for URGNA2012 shown in the appendix for 4.8 at
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012.
// The numbering of steps here corresponds to the spec's numbering in the
// appendix.
func (u *urna) hashToRelatedURGNA2012(b string, names *issuer) map[string][]string {
	// termsFor is the hash to related blank nodes map.
	termsFor := make(map[string][]string)
	for _, s := range u.statementsFor[b] { // 1.
		var (
			term string
			pos  byte
		)
		switch {
		case isBlank(s.Subject.Value) && s.Subject.Value != b: // 1.1
			term = s.Subject.Value
			pos = 'p'
		case isBlank(s.Object.Value) && s.Object.Value != b: // 1.2
			term = s.Object.Value
			pos = 'r'
		default:
			continue // 1.3
		}
		// 1.4
		hash := u.hashRelatedBlank(term, s, names, pos)
		termsFor[string(hash)] = append(termsFor[string(hash)], term)
	}
	return termsFor
}
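// For example (a sketch), given the quad
//
//	_:b0 <http://example.com/p> _:b1 .
//
// both routines above report _:b1 as related to _:b0, but they hash it
// under different position codes: 'o' (object) for URDNA2015 and 'r' for
// URGNA2012, so the two algorithms produce different related-node hashes
// for identical data.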
// hashRelatedBlank is the algorithm described in section 4.7 of the spec at
// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#hash-related-blank-node.
func (u *urna) hashRelatedBlank(term string, s *Statement, names *issuer, pos byte) []byte {
	// 1.
	var b string
	switch {
	case u.canon.has(term):
		b = u.canon.issueFor(term)
	case names.has(term):
		b = names.issueFor(term)
	default:
		b = u.hashFirstDegreeQuads(term)
	}

	// 2.
	u.hash.Reset()
	u.hash.Write([]byte{pos})

	if pos != 'g' { // 3.
		if u.label == "" {
			// URDNA2015: Term.Value retained the angle quotes
			// so we don't need to add them.
			u.hash.Write([]byte(s.Predicate.Value))
		} else {
			// URGNA2012 does not delimit predicate by < and >.
			// https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#urgna2012
			// with reference to 4.7.
			u.hash.Write([]byte(unquoteIRI(s.Predicate.Value)))
		}
	}

	// 4. and 5.
	u.hash.Write([]byte(b))
	return hex(u.hash.Sum(nil))
}

// issuer is an identifier issuer.
type issuer struct {
	prefix  string
	issued  map[string]string
	ordered []string
}

// newIssuer returns a new identifier issuer with the given prefix.
func newIssuer(prefix string) *issuer {
	return &issuer{prefix: prefix, issued: make(map[string]string)}
}

// issueFor implements the issue identifier algorithm.
//
// See https://json-ld.github.io/rdf-dataset-canonicalization/spec/index.html#issue-identifier-algorithm
func (i *issuer) issueFor(b string) string {
	c, ok := i.issued[b]
	if ok {
		return c
	}
	c = fmt.Sprintf("%s%d", i.prefix, len(i.issued))
	i.issued[b] = c
	i.ordered = append(i.ordered, b)
	return c
}

func (i *issuer) has(id string) bool {
	_, ok := i.issued[id]
	return ok
}

func (i *issuer) clone() *issuer {
	new := issuer{
		prefix:  i.prefix,
		issued:  make(map[string]string, len(i.issued)),
		ordered: make([]string, len(i.ordered)),
	}
	copy(new.ordered, i.ordered)
	for k, v := range i.issued {
		new.issued[k] = v
	}
	return &new
}

func hex(data []byte) []byte {
	const digit = "0123456789abcdef"
	buf := make([]byte, 0, len(data)*2)
	for _, b := range data {
		buf = append(buf, digit[b>>4], digit[b&0xf])
	}
	return buf
}

// permutations is a string permutation generator.
type permutations struct {
	src []string
	dst []string
	idx []int

	perm *combin.PermutationGenerator
}

// newPermutations returns a new permutations.
func newPermutations(src []string) *permutations {
	return &permutations{
		src:  src,
		dst:  make([]string, len(src)),
		perm: combin.NewPermutationGenerator(len(src), len(src)),
		idx:  make([]int, len(src)),
	}
}

// next returns whether there is another permutation available.
func (p *permutations) next() bool {
	return p.perm.Next()
}

// permutation returns the permutation. The caller may not retain the
// returned slice between iterations.
func (p *permutations) permutation() []string {
	p.perm.Permutation(p.idx)
	for i, j := range p.idx {
		p.dst[j] = p.src[i]
	}
	return p.dst
}
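// A minimal sketch of the generator above (the labels are illustrative):
//
//	p := newPermutations([]string{"_:b0", "_:b1"})
//	for p.next() {
//		// p.permutation() yields each ordering of the two labels
//		// in turn; copy the slice if it must be retained.
//	}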