mirror of
https://github.com/gonum/gonum.git
synced 2025-10-05 07:06:54 +08:00
577 lines
13 KiB
Go
577 lines
13 KiB
Go
// Copyright ©2021 The Gonum Authors. All rights reserved.
|
||
// Use of this source code is governed by a BSD-style
|
||
// license that can be found in the LICENSE file.
|
||
|
||
package rdf
|
||
|
||
import (
|
||
"errors"
|
||
"sort"
|
||
)
|
||
|
||
// Throughout, the comments refer to doi:10.1145/3068333 which should be
|
||
// understood as a synonym for http://aidanhogan.com/docs/rdf-canonicalisation.pdf
|
||
// although there are differences between the two, see http://aidanhogan.com/#errataH17.
|
||
// Where there are differences, the document at http://aidanhogan.com/ is the
|
||
// canonical truth. The DOI reference is referred to for persistence.
|
||
|
||
// Lean returns an RDF core of g that entails g. If g contains any non-zero
|
||
// labels, Lean will return a non-nil error and a core of g assuming no graph
|
||
// labels exist.
|
||
//
|
||
// See http://aidanhogan.com/docs/rdf-canonicalisation.pdf for details of
|
||
// the algorithm.
|
||
func Lean(g []*Statement) ([]*Statement, error) {
|
||
// BUG(kortschak): Graph leaning does not take into account graph label terms
|
||
// since the formal semantics for a multiple graph data model have not been
|
||
// defined. See https://www.w3.org/TR/rdf11-datasets/#declaring.
|
||
|
||
var (
|
||
hasBlanks bool
|
||
err error
|
||
)
|
||
for _, s := range g {
|
||
if isBlank(s.Subject.Value) || isBlank(s.Object.Value) {
|
||
hasBlanks = true
|
||
if err != nil {
|
||
break
|
||
}
|
||
}
|
||
if s.Label.Value != "" && err == nil {
|
||
err = errors.New("rdf: data-set contains graph names")
|
||
if hasBlanks {
|
||
break
|
||
}
|
||
}
|
||
}
|
||
if hasBlanks {
|
||
g = lean(&dfs{}, g)
|
||
}
|
||
return g, err
|
||
}
|
||
|
||
// removeRedundantBnodes removes blank nodes whose edges are a subset of
|
||
// another term in the RDF graph.
|
||
//
|
||
// This is algorithm 4 in doi:10.1145/3068333.
|
||
func removeRedundantBnodes(g []*Statement) []*Statement {
|
||
g = append(g[:0:0], g...)
|
||
for {
|
||
edges := make(map[string]map[triple]bool)
|
||
for _, s := range g {
|
||
for i, t := range []string{
|
||
s.Subject.Value,
|
||
s.Object.Value,
|
||
} {
|
||
e, ok := edges[t]
|
||
if !ok {
|
||
e = make(map[triple]bool)
|
||
edges[t] = e
|
||
}
|
||
switch i {
|
||
case 0:
|
||
e[triple{s.Predicate.Value, s.Object.Value, "+"}] = true
|
||
case 1:
|
||
e[triple{s.Predicate.Value, s.Subject.Value, "-"}] = true
|
||
}
|
||
}
|
||
}
|
||
|
||
seen := make(map[string]bool)
|
||
bNodes := make(map[string]bool)
|
||
terms := make(map[string]bool)
|
||
for _, s := range g {
|
||
for _, t := range []string{
|
||
s.Subject.Value,
|
||
s.Predicate.Value,
|
||
s.Object.Value,
|
||
} {
|
||
terms[t] = true
|
||
if isBlank(t) {
|
||
bNodes[t] = true
|
||
} else {
|
||
seen[t] = true
|
||
}
|
||
}
|
||
}
|
||
|
||
redundant := make(map[string]bool)
|
||
for x := range bNodes {
|
||
for xp := range terms {
|
||
if isProperSubset(edges[x], edges[xp]) || (seen[xp] && isEqualEdges(edges[x], edges[xp])) {
|
||
redundant[x] = true
|
||
break
|
||
}
|
||
}
|
||
seen[x] = true
|
||
}
|
||
|
||
n := len(g)
|
||
for i := 0; i < len(g); {
|
||
if !redundant[g[i].Subject.Value] && !redundant[g[i].Object.Value] {
|
||
i++
|
||
continue
|
||
}
|
||
g[i], g = g[len(g)-1], g[:len(g)-1]
|
||
}
|
||
if n == len(g) {
|
||
return g
|
||
}
|
||
}
|
||
}
|
||
|
||
// triple is a group of three strings used as a map key, both for the
// edges of a term and for statement patterns.
type triple [3]string
|
||
|
||
func isProperSubset(a, b map[triple]bool) bool {
|
||
for k := range a {
|
||
if !b[k] {
|
||
return false
|
||
}
|
||
}
|
||
return len(a) < len(b)
|
||
}
|
||
|
||
func isEqualEdges(a, b map[triple]bool) bool {
|
||
if len(a) != len(b) {
|
||
return false
|
||
}
|
||
for k := range a {
|
||
if !b[k] {
|
||
return false
|
||
}
|
||
}
|
||
return true
|
||
}
|
||
|
||
// findCandidates finds candidates for blank nodes and blank nodes that are fixed.
|
||
//
|
||
// This is algorithm 5 in doi:10.1145/3068333.
|
||
func findCandidates(g []*Statement) ([]*Statement, map[string]bool, map[string]map[string]bool, bool) {
|
||
g = removeRedundantBnodes(g)
|
||
|
||
edges := make(map[triple]bool)
|
||
f := make(map[string]bool)
|
||
for _, s := range g {
|
||
sub := s.Subject.Value
|
||
prd := s.Predicate.Value
|
||
obj := s.Object.Value
|
||
|
||
edges[triple{sub, prd, obj}] = true
|
||
edges[triple{sub, prd, "*"}] = true
|
||
edges[triple{"*", prd, obj}] = true
|
||
switch {
|
||
case isBlank(sub) && isBlank(obj):
|
||
f[sub] = false
|
||
f[obj] = false
|
||
case isBlank(sub):
|
||
if _, ok := f[sub]; !ok {
|
||
f[sub] = true
|
||
}
|
||
case isBlank(obj):
|
||
if _, ok := f[obj]; !ok {
|
||
f[obj] = true
|
||
}
|
||
}
|
||
}
|
||
for k, v := range f {
|
||
if !v {
|
||
delete(f, k)
|
||
}
|
||
}
|
||
if len(f) == 0 {
|
||
f = nil
|
||
}
|
||
|
||
cands := make(map[string]map[string]bool)
|
||
bnodes := make(map[string]bool)
|
||
for _, s := range g {
|
||
for _, b := range []string{
|
||
s.Subject.Value,
|
||
s.Object.Value,
|
||
} {
|
||
if !isBlank(b) {
|
||
continue
|
||
}
|
||
bnodes[b] = true
|
||
if f[b] {
|
||
cands[b] = map[string]bool{b: true}
|
||
} else {
|
||
terms := make(map[string]bool)
|
||
for _, s := range g {
|
||
for _, t := range []string{
|
||
s.Subject.Value,
|
||
s.Predicate.Value,
|
||
s.Object.Value,
|
||
} {
|
||
terms[t] = true
|
||
}
|
||
}
|
||
cands[b] = terms
|
||
}
|
||
}
|
||
}
|
||
if isEqualTerms(f, bnodes) {
|
||
return g, f, cands, true
|
||
}
|
||
|
||
for {
|
||
bb := make(map[string]bool)
|
||
for b := range bnodes {
|
||
if !f[b] {
|
||
bb[b] = true
|
||
}
|
||
}
|
||
for b := range bb {
|
||
for x := range cands[b] {
|
||
if x == b {
|
||
continue
|
||
}
|
||
for _, s := range g {
|
||
if s.Subject.Value != b {
|
||
continue
|
||
}
|
||
prd := s.Predicate.Value
|
||
obj := s.Object.Value
|
||
if (inILF(obj, f) && !edges[triple{x, prd, obj}]) || (bb[obj] && !edges[triple{x, prd, "*"}]) {
|
||
delete(cands[b], x)
|
||
break
|
||
}
|
||
}
|
||
if !cands[b][x] {
|
||
continue
|
||
}
|
||
for _, s := range g {
|
||
if s.Object.Value != b {
|
||
continue
|
||
}
|
||
sub := s.Subject.Value
|
||
prd := s.Predicate.Value
|
||
if (inIF(sub, f) && !edges[triple{sub, prd, x}]) || (bb[sub] && !edges[triple{"*", prd, x}]) {
|
||
delete(cands[b], x)
|
||
break
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
fp := f
|
||
f = make(map[string]bool)
|
||
for b := range fp {
|
||
f[b] = true
|
||
}
|
||
for b := range bb { // Mark newly fixed blank nodes.
|
||
if len(cands[b]) == 1 && cands[b][b] {
|
||
f[b] = true
|
||
}
|
||
}
|
||
allFixed := isEqualTerms(f, bnodes)
|
||
if isEqualTerms(fp, f) || allFixed {
|
||
if len(f) == 0 {
|
||
f = nil
|
||
}
|
||
return g, f, cands, allFixed
|
||
}
|
||
}
|
||
}
|
||
|
||
// inILF returns whether t is in IL or F.
|
||
func inILF(t string, f map[string]bool) bool {
|
||
return isIRI(t) || isLiteral(t) || f[t]
|
||
}
|
||
|
||
// inIF returns whether t is in I or F.
|
||
func inIF(t string, f map[string]bool) bool {
|
||
return isIRI(t) || f[t]
|
||
}
|
||
|
||
// dfs is a depth-first search strategy. It has no fields; the search is
// carried out by its evaluate, search and join methods.
type dfs struct{}
|
||
|
||
// lean returns a core of the RDF graph g using the given strategy.
|
||
//
|
||
// This is lines 1-9 of algorithm 6 in doi:10.1145/3068333.
|
||
func lean(strategy *dfs, g []*Statement) []*Statement {
|
||
foundBnode := false
|
||
search:
|
||
for _, s := range g {
|
||
for _, t := range []string{
|
||
s.Subject.Value,
|
||
s.Object.Value,
|
||
} {
|
||
if isBlank(t) {
|
||
foundBnode = true
|
||
break search
|
||
}
|
||
}
|
||
}
|
||
if !foundBnode {
|
||
return g
|
||
}
|
||
g, fixed, cands, allFixed := findCandidates(g)
|
||
if allFixed {
|
||
return g
|
||
}
|
||
for _, s := range g {
|
||
if isBlank(s.Subject.Value) && isBlank(s.Object.Value) {
|
||
mu := make(map[string]string, len(fixed))
|
||
for b := range fixed {
|
||
mu[b] = b
|
||
}
|
||
mu = findCoreEndomorphism(strategy, g, cands, mu)
|
||
return applyMu(g, mu)
|
||
}
|
||
}
|
||
return g
|
||
}
|
||
|
||
// findCoreEndomorphism returns a core solution using the given strategy.
|
||
//
|
||
// This is lines 10-14 of algorithm 6 in doi:10.1145/3068333.
|
||
func findCoreEndomorphism(strategy *dfs, g []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string {
|
||
var q []*Statement
|
||
preds := make(map[string]int)
|
||
seen := make(map[triple]bool)
|
||
for _, s := range g {
|
||
preds[s.Predicate.Value]++
|
||
if isBlank(s.Subject.Value) && isBlank(s.Object.Value) {
|
||
if seen[triple{s.Subject.Value, s.Predicate.Value, s.Object.Value}] {
|
||
continue
|
||
}
|
||
seen[triple{s.Subject.Value, s.Predicate.Value, s.Object.Value}] = true
|
||
q = append(q, s)
|
||
}
|
||
}
|
||
sort.Slice(q, func(i, j int) bool {
|
||
return selectivity(q[i], cands, preds) < selectivity(q[j], cands, preds)
|
||
})
|
||
return strategy.evaluate(g, q, cands, mu)
|
||
}
|
||
|
||
// selectivity returns the selectivity heuristic score for s. Lower scores
|
||
// are more selective.
|
||
func selectivity(s *Statement, cands map[string]map[string]bool, preds map[string]int) int {
|
||
return min(len(cands[s.Subject.Value])*len(cands[s.Object.Value]), preds[s.Predicate.Value])
|
||
}
|
||
|
||
// evaluate returns an endomorphism using a DFS strategy.
|
||
//
|
||
// This is lines 25-32 of algorithm 6 in doi:10.1145/3068333.
|
||
func (st *dfs) evaluate(g, q []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string {
|
||
mu = st.search(g, q, cands, mu)
|
||
for len(mu) != len(codom(mu)) {
|
||
mupp := fixedFrom(cands)
|
||
mup := findCoreEndomorphism(st, applyMu(g, mu), cands, mupp)
|
||
if isAutomorphism(mup) {
|
||
return mu
|
||
}
|
||
for b, x := range mu {
|
||
if _, ok := mup[b]; !ok {
|
||
mup[b] = x
|
||
}
|
||
}
|
||
mu = mup
|
||
}
|
||
return mu
|
||
}
|
||
|
||
// fixedFrom returns the identity mapping on every key of cands whose
// candidate set has exactly one entry, that entry being the key itself
// mapped to true.
func fixedFrom(cands map[string]map[string]bool) map[string]string {
	fixed := make(map[string]string)
	for b, set := range cands {
		if len(set) != 1 {
			continue
		}
		if set[b] {
			fixed[b] = b
		}
	}
	return fixed
}
|
||
|
||
// applyMu applies mu to g returning the result.
|
||
func applyMu(g []*Statement, mu map[string]string) []*Statement {
|
||
back := make([]Statement, 0, len(g))
|
||
dst := make([]*Statement, 0, len(g))
|
||
seen := make(map[Statement]bool)
|
||
for _, s := range g {
|
||
n := Statement{
|
||
Subject: Term{Value: translate(s.Subject.Value, mu)},
|
||
Predicate: Term{Value: s.Predicate.Value},
|
||
Object: Term{Value: translate(s.Object.Value, mu)},
|
||
Label: Term{Value: s.Label.Value},
|
||
}
|
||
if seen[n] {
|
||
continue
|
||
}
|
||
seen[n] = true
|
||
back = append(back, n)
|
||
dst = append(dst, &back[len(back)-1])
|
||
}
|
||
return dst
|
||
}
|
||
|
||
// search returns a minimum endomorphism using a DFS strategy.
|
||
//
|
||
// This is lines 33-46 of algorithm 6 in doi:10.1145/3068333.
|
||
func (st *dfs) search(g, q []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string {
|
||
qMin := q[0]
|
||
m := st.join(qMin, g, cands, mu)
|
||
if len(m) == 0 {
|
||
// Early exit if no mapping found.
|
||
return nil
|
||
}
|
||
sortByCodom(m)
|
||
mMin := m[0]
|
||
qp := q[1:]
|
||
if len(qp) != 0 {
|
||
for len(m) != 0 {
|
||
mMin = m[0]
|
||
mup := st.search(g, qp, cands, mMin)
|
||
if !isAutomorphism(mup) {
|
||
return mup
|
||
}
|
||
m = m[1:]
|
||
}
|
||
}
|
||
return mMin
|
||
}
|
||
|
||
// isAutomorphism returns whether mu is an automorphism, this is equivalent to
|
||
// dom(mu) == codom(mu).
|
||
func isAutomorphism(mu map[string]string) bool {
|
||
return isEqualTerms(dom(mu), codom(mu))
|
||
}
|
||
|
||
// dom returns the domain of mu, the set of its keys.
func dom(mu map[string]string) map[string]bool {
	keys := make(map[string]bool, len(mu))
	for k := range mu {
		keys[k] = true
	}
	return keys
}
|
||
|
||
// codom returns the codomain of mu, the set of its values.
func codom(mu map[string]string) map[string]bool {
	values := make(map[string]bool, len(mu))
	for k := range mu {
		values[mu[k]] = true
	}
	return values
}
|
||
|
||
// isEqualTerms returns whether a and b are identical, that is, whether
// they hold the same number of entries and every key of a maps to true
// in b.
func isEqualTerms(a, b map[string]bool) bool {
	if len(b) != len(a) {
		return false
	}
	for term := range a {
		if present := b[term]; present {
			continue
		}
		return false
	}
	return true
}
|
||
|
||
// sortByCodom performs a sort of maps ordered by fewest blank nodes in
|
||
// codomain, then fewest self mappings.
|
||
func sortByCodom(maps []map[string]string) {
|
||
m := orderedByCodom{
|
||
maps: maps,
|
||
attrs: make([]attrs, len(maps)),
|
||
}
|
||
for i, mu := range maps {
|
||
m.attrs[i].blanks = make(map[string]bool)
|
||
for x, y := range mu {
|
||
if isBlank(y) {
|
||
m.attrs[i].blanks[y] = true
|
||
}
|
||
if x == y {
|
||
m.attrs[i].selfs++
|
||
}
|
||
}
|
||
}
|
||
sort.Sort(m)
|
||
}
|
||
|
||
// orderedByCodom provides the Len, Less and Swap methods needed to sort
// maps. attrs[i] holds the sort attributes of maps[i] and the two slices
// are permuted together.
type orderedByCodom struct {
	maps  []map[string]string
	attrs []attrs
}

// attrs holds the sort attributes of one mapping: the set of blank
// nodes among its values and the number of keys that map to themselves.
type attrs struct {
	blanks map[string]bool
	selfs  int
}

// Len returns the number of mappings.
func (m orderedByCodom) Len() int { return len(m.maps) }

// Less orders mapping i before mapping j if it has fewer blank nodes or,
// when those counts are equal, fewer self mappings.
func (m orderedByCodom) Less(i, j int) bool {
	bi, bj := len(m.attrs[i].blanks), len(m.attrs[j].blanks)
	if bi != bj {
		return bi < bj
	}
	return m.attrs[i].selfs < m.attrs[j].selfs
}

// Swap exchanges mappings i and j along with their attributes.
func (m orderedByCodom) Swap(i, j int) {
	m.maps[j], m.maps[i] = m.maps[i], m.maps[j]
	m.attrs[j], m.attrs[i] = m.attrs[i], m.attrs[j]
}
|
||
|
||
// join evaluates the given pattern, q, joining with solutions in m.
|
||
// This takes only a single mapping and so only works for the DFS strategy.
|
||
//
|
||
// This is lines 47-51 of algorithm 6 in doi:10.1145/3068333.
|
||
func (st *dfs) join(q *Statement, g []*Statement, cands map[string]map[string]bool, m map[string]string) []map[string]string {
|
||
var mp []map[string]string
|
||
isLoop := q.Subject.Value == q.Object.Value
|
||
for _, s := range g {
|
||
// Line 45: M_q ← {µ | µ(q) ∈ G}
|
||
// | µ(q) ∈ G
|
||
//
|
||
// µ(q) ∈ G ↔ (µ(q_s),q_p,µ(q_o)) ∈ G
|
||
if q.Predicate.Value != s.Predicate.Value {
|
||
continue
|
||
}
|
||
// q_s = q_o ↔ µ(q_s) =_µ(q_o)
|
||
if isLoop && s.Subject.Value != s.Object.Value {
|
||
continue
|
||
}
|
||
|
||
// Line 46: M_q' ← {µ ∈ M_q | for all b ∈ bnodes({q}), µ(b) ∈ cands[b]}
|
||
// | for all b ∈ bnodes({q}), µ(b) ∈ cands[b]
|
||
if !cands[q.Subject.Value][s.Subject.Value] || !cands[q.Object.Value][s.Object.Value] {
|
||
continue
|
||
}
|
||
|
||
// Line 47: M' ← M_q' ⋈ M
|
||
// M₁ ⋈ M₂ = {μ₁ ∪ μ₂ | μ₁ ∈ M₁, μ₂ ∈ M₂ and μ₁, μ₂ are compatible mappings}
|
||
// | μ₁ ∈ M₁, μ₂ ∈ M₂ and μ₁, μ₂ are compatible mappings
|
||
if mq, ok := m[q.Subject.Value]; ok && mq != s.Subject.Value {
|
||
continue
|
||
}
|
||
if !isLoop {
|
||
if mq, ok := m[q.Object.Value]; ok && mq != s.Object.Value {
|
||
continue
|
||
}
|
||
}
|
||
// Line 47: μ₁ ∪ μ₂
|
||
var mu map[string]string
|
||
if isLoop {
|
||
mu = map[string]string{
|
||
q.Subject.Value: s.Subject.Value,
|
||
}
|
||
} else {
|
||
mu = map[string]string{
|
||
q.Subject.Value: s.Subject.Value,
|
||
q.Object.Value: s.Object.Value,
|
||
}
|
||
}
|
||
for b, mb := range m {
|
||
mu[b] = mb
|
||
}
|
||
mp = append(mp, mu)
|
||
}
|
||
return mp
|
||
}
|