// Copyright ©2020 The Gonum Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package rdf import ( "cmp" "crypto/md5" "flag" "fmt" "hash" "io" "os" "path/filepath" "reflect" "slices" "sort" "testing" "text/tabwriter" "time" "golang.org/x/exp/rand" ) var ( origSeed = flag.Int64("seed", 1, "specify random seed to use for each test (negative for Unix time)") tests = flag.String("test", "*-in.n[qt]", "specify test case in testdata") ) func TestIsoCanonicalHashes(t *testing.T) { seed := uint64(*origSeed) if *origSeed < 0 { seed = uint64(time.Now().UnixNano()) } defer func() { if t.Failed() && *origSeed < 0 { t.Logf("time based seed: %d", seed) } }() // Number of times to run IsoCanonicalHashes to check consistency. const retries = 5 // Share a global hash function to ensure that we // are resetting the function internally on each use. hash := md5.New() glob, err := filepath.Glob(filepath.Join("testdata", *tests)) if err != nil { t.Fatalf("Failed to open test suite: %v", err) } for _, path := range glob { name := filepath.Base(path) t.Run(name, func(t *testing.T) { src := rand.NewSource(seed) f, err := os.Open(path) if err != nil { t.Fatalf("Failed to open test suite in %q: %v", path, err) } var statements []*Statement dec := NewDecoder(f) for { s, err := dec.Unmarshal() if err != nil { if err == io.EOF { break } t.Fatalf("Unexpected error reading from %q: %v", path, err) } statements = append(statements, s) } f.Close() for _, decomp := range []bool{false, true} { t.Run(fmt.Sprintf("decomp=%t", decomp), func(t *testing.T) { var last map[string][]byte for i := 0; i < retries; i++ { curr, terms := IsoCanonicalHashes(statements, decomp, true, hash, make([]byte, 16)) if !hashesDisjoint(terms) { t.Errorf("IsoCanonicalHashes did not uniquely identify nodes %q with decomp=%t", name, decomp) } if last != nil { last := relabelStatements(statements, termsFor(last, hash)) sortSimpleLexicalStatements(last) curr := relabelStatements(statements, termsFor(curr, hash)) sortSimpleLexicalStatements(curr) if !reflect.DeepEqual(last, curr) { t.Errorf("IsoCanonicalHashes was not stable between runs on %q with decomp=%t", name, decomp) t.Log("Current run:") for _, s := range curr { t.Logf("\t%s", s) } t.Log("Previous run:") for _, s := range last { t.Logf("\t%s", s) } break } } last = curr } hashes := last ok := allUnique(hashes) if !ok { t.Errorf("Failed to get unique hashes for %q disjoint with decomp=%t", name, decomp) t.Logf("skipping %q decomp=%t", path, decomp) return } // Test that a graph is not isomorphic with one generated // by deleting the last statement. t.Run("isomorphic G != G-s", func(t *testing.T) { if len(statements) == 0 { return } if Isomorphic(statements, statements[:len(statements)-1], decomp, hash) { t.Error("Isomorphic(G, G-s)=true") } }) // Test that a graph is not isomorphic with one generated // by hashing the first grounded statement. t.Run("isomorphic G != Gμ(g)", func(t *testing.T) { mangled, mangTerms := mangleFirstIL(statements, hash) if mangTerms == nil { // All terms were blanks. return } if Isomorphic(statements, mangled, decomp, hash) { t.Error("Isomorphic(G, Gμ(g))=true") } }) // Test that a graph is not isomorphic with one generated // by merging the first two lexically sorted blank nodes // into one. t.Run("isomorphic G != G(b1∪b2)", func(t *testing.T) { mangled, mangTerms := mergeFirst2B(statements) if mangTerms == nil { // All terms were blanks. return } if Isomorphic(statements, mangled, decomp, hash) { t.Error("Isomorphic(G, G(b1∪b2))=true") } }) // Relabel a copy of the statements and then sort. orig := relabelStatements(statements, termsFor(hashes, hash)) sortSimpleLexicalStatements(orig) for _, perm := range []struct { name string data func() ([]*Statement, map[string]string) }{ { name: "reverse statements", data: func() ([]*Statement, map[string]string) { return reverseStatements(statements) }, }, { name: "permute statements", data: func() ([]*Statement, map[string]string) { return permuteStatements(statements, src) }, }, { name: "permute blank labels", data: func() ([]*Statement, map[string]string) { return permuteBlanks(statements, src) }, }, { name: "hash blank labels", data: func() ([]*Statement, map[string]string) { return hashBlanks(statements, md5.New()) }, }, { name: "reverse statements and hash blank labels", data: func() ([]*Statement, map[string]string) { // Reordering must come first since it does not return // a non-nil terms map, but hashBlanks does. s, _ := reverseStatements(statements) return hashBlanks(s, md5.New()) }, }, { name: "permute statements and hash blank labels", data: func() ([]*Statement, map[string]string) { // Reordering must come first since it does not return // a non-nil terms map, but hashBlanks does. s, _ := permuteStatements(statements, src) return hashBlanks(s, md5.New()) }, }, } { t.Run(perm.name, func(t *testing.T) { if debug { fmt.Fprintf(os.Stderr, "\n%q %q decomp=%t:\n", path, perm.name, decomp) } altStatements, terms := perm.data() altHashes, altTerms := IsoCanonicalHashes(altStatements, decomp, true, hash, make([]byte, 16)) ok := allUnique(altHashes) && hashesDisjoint(altTerms) if !ok { t.Errorf("Failed to get unique hashes for %q alternative disjoint %q with decomp=%t", path, perm.name, decomp) } if debug { fmt.Fprintln(os.Stderr, "Name mappings from original dataset:") keys := make([]string, len(hashes)) var i int for k := range hashes { keys[i] = k i++ } slices.Sort(keys) w := tabwriter.NewWriter(os.Stderr, 0, 4, 8, ' ', 0) for _, k := range keys { fmt.Fprintf(w, "\t%s\t%s\n", k, translate(k, terms)) } w.Flush() fmt.Fprintln(os.Stderr) } // Relabel a copy of the alternative statements and then sort. alt := relabelStatements(altStatements, termsFor(altHashes, hash)) sortSimpleLexicalStatements(alt) for i := range statements { if *orig[i] != *alt[i] { // Otherwise we have pointer inequality. t.Errorf("Unexpected statement in %q %q decomp=%t:\ngot: %#v\nwant:%#v", path, perm.name, decomp, orig[i], alt[i]) break } } if !Isomorphic(statements, altStatements, decomp, hash) { t.Errorf("Isomorphic(G, perm(G))=false in %q %q decomp=%t", path, perm.name, decomp) } }) } }) } }) } } func permuteStatements(s []*Statement, src rand.Source) ([]*Statement, map[string]string) { rnd := rand.New(src) m := make([]*Statement, len(s)) for x, y := range rnd.Perm(len(s)) { m[x] = s[y] } return m, nil } func reverseStatements(s []*Statement) ([]*Statement, map[string]string) { m := make([]*Statement, len(s)) for i, j := 0, len(s)-1; i < len(s); i, j = i+1, j-1 { m[j] = s[i] } return m, nil } func permuteBlanks(s []*Statement, src rand.Source) ([]*Statement, map[string]string) { rnd := rand.New(src) terms := make(map[string]string) for _, e := range s { for _, t := range []string{ e.Subject.Value, e.Predicate.Value, e.Object.Value, e.Label.Value, } { if t == "" { continue } terms[t] = t } } var blanks []string for t := range terms { if isBlank(t) { blanks = append(blanks, t) } } slices.Sort(blanks) for x, y := range rnd.Perm(len(blanks)) { terms[blanks[x]] = blanks[y] } m := relabelStatements(s, terms) return m, terms } func hashBlanks(s []*Statement, h hash.Hash) ([]*Statement, map[string]string) { terms := make(map[string]string) for _, e := range s { for _, t := range []string{ e.Subject.Value, e.Predicate.Value, e.Object.Value, e.Label.Value, } { if !isBlank(t) { continue } h.Reset() h.Write([]byte(t)) terms[t] = fmt.Sprintf("_:%0*x", 2*h.Size(), h.Sum(nil)) } } m := relabelStatements(s, terms) return m, terms } func mangleFirstIL(s []*Statement, h hash.Hash) ([]*Statement, map[string]string) { terms := make(map[string]string) for _, e := range s { for _, t := range []string{ e.Subject.Value, e.Predicate.Value, e.Object.Value, e.Label.Value, } { if isBlank(t) { continue } h.Reset() h.Write([]byte(t)) terms[t] = fmt.Sprintf(`"%0*x"`, 2*h.Size(), h.Sum(nil)) return relabelStatements(s, terms), terms } } m := relabelStatements(s, nil) return m, nil } func mergeFirst2B(s []*Statement) ([]*Statement, map[string]string) { terms := make(map[string]string) for _, e := range s { for _, t := range []string{ e.Subject.Value, e.Predicate.Value, e.Object.Value, e.Label.Value, } { if !isBlank(t) { continue } terms[t] = t } } if len(terms) < 2 { return relabelStatements(s, nil), nil } blanks := make([]string, len(terms)) i := 0 for _, b := range terms { blanks[i] = b i++ } slices.Sort(blanks) terms[blanks[1]] = terms[blanks[0]] m := relabelStatements(s, terms) return m, nil } func hashesDisjoint(terms map[string]map[string]bool) bool { for _, t := range terms { if len(t) != 1 { return false } } return true } func TestLexicalStatements(t *testing.T) { if *tests == "" { *tests = "*" } hash := md5.New() glob, err := filepath.Glob(filepath.Join("testdata", *tests)) if err != nil { t.Fatalf("Failed to open test suite: %v", err) } for _, path := range glob { f, err := os.Open(path) if err != nil { t.Fatalf("Failed to open test suite in %q: %v", path, err) } var statements []*Statement dec := NewDecoder(f) for { s, err := dec.Unmarshal() if err != nil { if err == io.EOF { break } t.Fatalf("Unexpected error reading from %q: %v", path, err) } statements = append(statements, s) } f.Close() for _, decomp := range []bool{false, true} { hashes, _ := IsoCanonicalHashes(statements, decomp, true, hash, make([]byte, 16)) terms := termsFor(hashes, hash) // Sort a copy of the statements based on hashes and then relabel. indirect := make([]*Statement, len(statements)) copy(indirect, statements) sort.Sort(lexicalStatements{indirect, hashes}) indirect = relabelStatements(indirect, terms) // Relabel a copy of the statements and then sort. direct := relabelStatements(statements, terms) sortSimpleLexicalStatements(direct) for i := range statements { if *indirect[i] != *direct[i] { // Otherwise we have pointer inequality. t.Errorf("Unexpected ordering of indirect sort in %q:\ngot: %#v\nwant:%#v", path, indirect[i], direct[i]) } } } } } func termsFor(hashes map[string][]byte, hash hash.Hash) map[string]string { terms := make(map[string]string) for t, h := range hashes { if isBlank(t) { terms[t] = fmt.Sprintf("_:%0*x", 2*hash.Size(), h) } } return terms } // sortSimpleLexicalStatements implements lexical statement sorting on the // literal values without interpolation. func sortSimpleLexicalStatements(statements []*Statement) { slices.SortFunc(statements, func(a, b *Statement) int { if n := cmp.Compare(unquoteIRI(a.Subject.Value), unquoteIRI(b.Subject.Value)); n != 0 { return n } // Always IRI. if n := cmp.Compare(unquoteIRI(a.Predicate.Value), unquoteIRI(b.Predicate.Value)); n != 0 { return n } return cmp.Compare(unquoteIRI(a.Object.Value), unquoteIRI(b.Object.Value)) }) } func relabelStatements(s []*Statement, terms map[string]string) []*Statement { m := make([]*Statement, len(s)) for i, e := range s { n := *e n.Subject = Term{Value: translate(n.Subject.Value, terms)} n.Predicate = Term{Value: translate(n.Predicate.Value, terms)} n.Object = Term{Value: translate(n.Object.Value, terms)} n.Label = Term{Value: translate(n.Label.Value, terms)} m[i] = &n } return m } func BenchmarkIsoCanonicalHashes(b *testing.B) { hash := md5.New() benchmarks := []string{ "test019-in.nq", "test044-in.nq", } for _, name := range benchmarks { path := filepath.Join("testdata", name) b.Run(name, func(b *testing.B) { f, err := os.Open(path) if err != nil { b.Fatalf("Failed to open test suite in %q: %v", path, err) } var statements []*Statement dec := NewDecoder(f) for { s, err := dec.Unmarshal() if err != nil { if err == io.EOF { break } b.Fatalf("Unexpected error reading from %q: %v", path, err) } statements = append(statements, s) } f.Close() nodes := make(map[string]bool) for _, s := range statements { for _, t := range []string{ s.Subject.Value, s.Predicate.Value, s.Object.Value, s.Label.Value, } { if t != "" { nodes[t] = true } } } n := len(nodes) for _, decomp := range []bool{false, true} { b.Run(fmt.Sprintf("decomp=%t", decomp), func(b *testing.B) { for i := 0; i < b.N; i++ { hashes, _ := IsoCanonicalHashes(statements, decomp, true, hash, make([]byte, 16)) if len(hashes) != n { b.Fatalf("unexpected number of hashes: %d != %d", len(hashes), len(statements)) } } }) } }) } }