mirror of
				https://github.com/gonum/gonum.git
				synced 2025-10-31 02:26:59 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			398 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			398 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
// Copyright ©2020 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:generate ragel -Z -G2 parse.rl
//go:generate ragel -Z -G2 extract.rl
//go:generate ragel -Z -G2 check.rl
//go:generate stringer -type=Kind
 | |
| 
 | |
| package rdf
 | |
| 
 | |
| import (
 | |
| 	"bufio"
 | |
| 	"bytes"
 | |
| 	"errors"
 | |
| 	"fmt"
 | |
| 	"io"
 | |
| 	"net/url"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| 	"unicode"
 | |
| 	"unicode/utf8"
 | |
| 
 | |
| 	"gonum.org/v1/gonum/graph"
 | |
| )
 | |
| 
 | |
| var (
 | |
| 	_ graph.Node = Term{}
 | |
| 	_ graph.Edge = (*Statement)(nil)
 | |
| 	_ graph.Line = (*Statement)(nil)
 | |
| )
 | |
| 
 | |
// Sentinel errors describing the class of failure encountered when
// handling RDF text.
var (
	// ErrInvalid indicates an invalid N-Quad.
	ErrInvalid = errors.New("invalid N-Quad")

	// ErrIncomplete indicates an incomplete N-Quad.
	ErrIncomplete = errors.New("incomplete N-Quad")

	// ErrInvalidTerm indicates an invalid term.
	ErrInvalidTerm = errors.New("invalid term")

	// ErrIncompleteTerm indicates an incomplete term.
	ErrIncompleteTerm = errors.New("incomplete term")
)
 | |
| 
 | |
// Kind represents the kind of an RDF term.
type Kind int

const (
	// Invalid is an invalid RDF term. It is the zero value of Kind.
	Invalid Kind = iota

	// IRI is the kind of an IRI term.
	// https://www.w3.org/TR/n-quads/#sec-iri
	IRI

	// Literal is the kind of an RDF literal.
	// https://www.w3.org/TR/n-quads/#sec-literals
	Literal

	// Blank is the kind of an RDF blank node term.
	// https://www.w3.org/TR/n-quads/#BNodes
	Blank
)
 | |
| 
 | |
// Term is an RDF term. It implements the graph.Node interface.
type Term struct {
	// Value is the text value of the term.
	Value string

	// UID is the unique ID for the term
	// in a collection of RDF terms.
	UID int64
}
 | |
| 
 | |
| // NewBlankTerm returns a Term based on the provided RDF blank node
 | |
| // label. The label should not include the "_:" prefix. The returned
 | |
| // Term will not have the UID set.
 | |
| func NewBlankTerm(label string) (Term, error) {
 | |
| 	err := checkLabelText([]rune(label))
 | |
| 	if err != nil {
 | |
| 		return Term{}, err
 | |
| 	}
 | |
| 	return Term{Value: blankPrefix + label}, nil
 | |
| }
 | |
| 
 | |
// blankPrefix is the text prefix that marks a blank node term.
const blankPrefix = "_:"

// isBlank reports whether s starts with the blank node prefix.
func isBlank(s string) bool {
	return strings.HasPrefix(s, blankPrefix)
}
 | |
| 
 | |
| // NewIRITerm returns a Term based on the provided IRI which must
 | |
| // be valid and include a scheme. The returned Term will not have
 | |
| // the UID set.
 | |
| func NewIRITerm(iri string) (Term, error) {
 | |
| 	err := checkIRIText(iri)
 | |
| 	if err != nil {
 | |
| 		return Term{}, err
 | |
| 	}
 | |
| 	return Term{Value: escape("<", iri, ">")}, nil
 | |
| }
 | |
| 
 | |
// isIRI reports whether s is delimited by angle brackets, that is,
// it starts with "<" and ends with ">". The text between the
// brackets is not inspected.
func isIRI(s string) bool {
	return strings.HasPrefix(s, "<") && strings.HasSuffix(s, ">")
}
 | |
| 
 | |
| // NewLiteralTerm returns a Term based on the literal text and an
 | |
| // optional qualifier which may either be a "@"-prefixed language
 | |
| // tag or a valid IRI. The text will be escaped if necessary and quoted,
 | |
| // and if an IRI is given it will be escaped if necessary. The returned
 | |
| // Term will not have the UID set.
 | |
| func NewLiteralTerm(text, qual string) (Term, error) {
 | |
| 	text = escape(`"`, text, `"`)
 | |
| 	if qual == "" {
 | |
| 		return Term{Value: text}, nil
 | |
| 	}
 | |
| 	if strings.HasPrefix(qual, "@") {
 | |
| 		err := checkLangText([]byte(qual))
 | |
| 		if err != nil {
 | |
| 			return Term{}, err
 | |
| 		}
 | |
| 		return Term{Value: text + qual}, nil
 | |
| 	}
 | |
| 	err := checkIRIText(qual)
 | |
| 	if err != nil {
 | |
| 		return Term{}, err
 | |
| 	}
 | |
| 	return Term{Value: text + escape("^^<", qual, ">")}, nil
 | |
| }
 | |
| 
 | |
| func checkIRIText(iri string) error {
 | |
| 	switch u, err := url.Parse(iri); {
 | |
| 	case err != nil:
 | |
| 		return err
 | |
| 	case u.Scheme == "":
 | |
| 		return fmt.Errorf("rdf: %w: relative IRI ref %q", ErrInvalidTerm, iri)
 | |
| 	default:
 | |
| 		return nil
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Parts returns the pars of the term and the kind of the term.
 | |
| // IRI node text is returned as a valid IRI with the quoting angle
 | |
| // brackets removed and escape sequences interpreted, and blank
 | |
| // nodes are stripped of the "_:" prefix.
 | |
| // When the term is a literal, qual will either be empty, an unescaped
 | |
| // IRI, or an RDF language tag prefixed with an @ symbol. The literal
 | |
| // text is returned unquoted and unescaped.
 | |
| func (t Term) Parts() (text, qual string, kind Kind, err error) {
 | |
| 	return extract([]rune(t.Value))
 | |
| }
 | |
| 
 | |
| // ID returns the value of the Term's UID field.
 | |
| func (t Term) ID() int64 { return t.UID }
 | |
| 
 | |
| // Statement is an RDF statement. It implements the graph.Edge and graph.Line
 | |
| // interfaces.
 | |
| type Statement struct {
 | |
| 	Subject   Term
 | |
| 	Predicate Term
 | |
| 	Object    Term
 | |
| 	Label     Term
 | |
| }
 | |
| 
 | |
| // String returns the RDF 1.1 N-Quad formatted statement.
 | |
| func (s *Statement) String() string {
 | |
| 	if s.Label.Value == "" {
 | |
| 		return fmt.Sprintf("%s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value)
 | |
| 	}
 | |
| 	return fmt.Sprintf("%s %s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value, s.Label.Value)
 | |
| }
 | |
| 
 | |
| // From returns the subject of the statement.
 | |
| func (s *Statement) From() graph.Node { return s.Subject }
 | |
| 
 | |
| // To returns the object of the statement.
 | |
| func (s *Statement) To() graph.Node { return s.Object }
 | |
| 
 | |
| // ID returns the UID of the Predicate field.
 | |
| func (s *Statement) ID() int64 { return s.Predicate.UID }
 | |
| 
 | |
| // ReversedEdge returns the receiver unaltered. If there is a semantically
 | |
| // valid edge reversal operation for the data, the user should implement
 | |
| // this by wrapping Statement in a type performing that operation.
 | |
| // See the ReversedLine example for details.
 | |
| func (s *Statement) ReversedEdge() graph.Edge { return s }
 | |
| 
 | |
| // ReversedLine returns the receiver unaltered. If there is a semantically
 | |
| // valid line reversal operation for the data, the user should implement
 | |
| // this by wrapping Statement in a type performing that operation.
 | |
| func (s *Statement) ReversedLine() graph.Line { return s }
 | |
| 
 | |
| // ParseNQuad parses the statement and returns the corresponding Statement.
 | |
| // All Term UID fields are zero on return.
 | |
| func ParseNQuad(statement string) (*Statement, error) {
 | |
| 	s, err := parse([]rune(statement))
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	return &s, err
 | |
| }
 | |
| 
 | |
| // Decoder is an RDF stream decoder. Statements returned by calls to the
 | |
| // Unmarshal method have their Terms' UID fields set so that unique terms
 | |
| // will have unique IDs and so can be used directly in a graph.Multi, or
 | |
| // in a graph.Graph if all predicate terms are identical. IDs created by
 | |
| // the decoder all exist within a single namespace and so Terms can be
 | |
| // uniquely identified by their UID. Term UIDs are based from 1 to allow
 | |
| // RDF-aware client graphs to assign ID if no ID has been assigned.
 | |
| type Decoder struct {
 | |
| 	scanner *bufio.Scanner
 | |
| 
 | |
| 	strings store
 | |
| 	ids     map[string]int64
 | |
| }
 | |
| 
 | |
| // NewDecoder returns a new Decoder that takes input from r.
 | |
| func NewDecoder(r io.Reader) *Decoder {
 | |
| 	return &Decoder{
 | |
| 		scanner: bufio.NewScanner(r),
 | |
| 		strings: make(store),
 | |
| 		ids:     make(map[string]int64),
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Reset resets the decoder to use the provided io.Reader, retaining
 | |
| // the existing Term ID mapping.
 | |
| func (dec *Decoder) Reset(r io.Reader) {
 | |
| 	dec.scanner = bufio.NewScanner(r)
 | |
| 	dec.strings = make(store)
 | |
| 	if dec.ids == nil {
 | |
| 		dec.ids = make(map[string]int64)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Unmarshal returns the next statement from the input stream.
 | |
| func (dec *Decoder) Unmarshal() (*Statement, error) {
 | |
| 	for dec.scanner.Scan() {
 | |
| 		data := bytes.TrimSpace(dec.scanner.Bytes())
 | |
| 		if len(data) == 0 || data[0] == '#' {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		s, err := ParseNQuad(string(data))
 | |
| 		if err != nil {
 | |
| 			return nil, fmt.Errorf("rdf: failed to parse %q: %w", data, err)
 | |
| 		}
 | |
| 		if s == nil {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		s.Subject.Value = dec.strings.intern(s.Subject.Value)
 | |
| 		s.Predicate.Value = dec.strings.intern(s.Predicate.Value)
 | |
| 		s.Object.Value = dec.strings.intern(s.Object.Value)
 | |
| 		s.Subject.UID = dec.idFor(s.Subject.Value)
 | |
| 		s.Object.UID = dec.idFor(s.Object.Value)
 | |
| 		s.Predicate.UID = dec.idFor(s.Predicate.Value)
 | |
| 		if s.Label.Value != "" {
 | |
| 			s.Label.Value = dec.strings.intern(s.Label.Value)
 | |
| 			s.Label.UID = dec.idFor(s.Label.Value)
 | |
| 		}
 | |
| 		return s, nil
 | |
| 	}
 | |
| 	dec.strings = nil
 | |
| 	err := dec.scanner.Err()
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	return nil, io.EOF
 | |
| }
 | |
| 
 | |
| func (dec *Decoder) idFor(s string) int64 {
 | |
| 	id, ok := dec.ids[s]
 | |
| 	if ok {
 | |
| 		return id
 | |
| 	}
 | |
| 	id = int64(len(dec.ids)) + 1
 | |
| 	dec.ids[s] = id
 | |
| 	return id
 | |
| }
 | |
| 
 | |
| // Terms returns the mapping between terms and graph node IDs constructed
 | |
| // during decoding the RDF statement stream.
 | |
| func (dec *Decoder) Terms() map[string]int64 {
 | |
| 	return dec.ids
 | |
| }
 | |
| 
 | |
// store is a string internment implementation.
type store map[string]string

// intern returns an interned version of the parameter. Strings shorter
// than 2 bytes (including the empty string) or longer than 512 bytes
// are returned as given and are not added to the store.
func (is store) intern(s string) string {
	if len(s) < 2 || len(s) > 512 {
		// Not enough benefit on average with real data.
		return s
	}

	if t, ok := is[s]; ok {
		return t
	}
	is[s] = s
	return s
}
 | |
| 
 | |
// escape returns s with escape sequences applied, preceded by lq and
// followed by rq. Either quote may be empty.
//
// Line feed and carriage return become \n and \r, and the double quote
// and backslash are preceded by a backslash. Every other ASCII rune,
// and every rune that strconv.IsPrint reports as printable, is written
// unchanged. The remaining runes are written as \uXXXX when below
// 0x10000 and as \UXXXXXXXX otherwise, using lower case hex digits.
func escape(lq, s, rq string) string {
	const hex = "0123456789abcdef"

	var buf strings.Builder
	// The output is never shorter than the input, so reserve that much.
	buf.Grow(len(lq) + len(s) + len(rq))
	buf.WriteString(lq)
	for _, r := range s {
		var c byte
		switch r {
		case '\n':
			c = 'n'
		case '\r':
			c = 'r'
		case '"', '\\':
			c = byte(r)
		default:
			switch {
			case r <= unicode.MaxASCII || strconv.IsPrint(r):
				buf.WriteRune(r)
			case r > utf8.MaxRune:
				r = 0xFFFD
				fallthrough
			case r < 0x10000:
				buf.WriteString("\\u")
				for s := 12; s >= 0; s -= 4 {
					buf.WriteByte(hex[r>>uint(s)&0xf])
				}
			default:
				buf.WriteString("\\U")
				for s := 28; s >= 0; s -= 4 {
					buf.WriteByte(hex[r>>uint(s)&0xf])
				}
			}
			continue
		}
		buf.WriteByte('\\')
		buf.WriteByte(c)
	}
	buf.WriteString(rq)
	return buf.String()
}
 | |
| 
 | |
// unEscape returns the text of r with backslash escape sequences
// interpreted. The single character escapes \t, \b, \n, \r, \f, \",
// \\ and \' are replaced by the byte they denote, and \uXXXX and
// \UXXXXXXXX are replaced by the rune with that hexadecimal value.
//
// r is expected to contain only complete, valid escape sequences.
// unEscape panics if the hex digits of a \u or \U escape cannot be
// parsed, and an escape truncated by the end of r causes an index out
// of range panic.
func unEscape(r []rune) string {
	var buf strings.Builder
	for i := 0; i < len(r); {
		switch r[i] {
		case '\\':
			// Step onto the character that selects the escape.
			i++
			var c byte
			switch r[i] {
			case 't':
				c = '\t'
			case 'b':
				c = '\b'
			case 'n':
				c = '\n'
			case 'r':
				c = '\r'
			case 'f':
				c = '\f'
			case '"':
				c = '"'
			case '\\':
				c = '\\'
			case '\'':
				c = '\''
			case 'u':
				rc, err := strconv.ParseInt(string(r[i+1:i+5]), 16, 32)
				if err != nil {
					panic(fmt.Errorf("internal parser error: %w", err))
				}
				buf.WriteRune(rune(rc))
				// Skip the 'u' and its four hex digits.
				i += 5
				continue
			case 'U':
				rc, err := strconv.ParseInt(string(r[i+1:i+9]), 16, 32)
				if err != nil {
					panic(fmt.Errorf("internal parser error: %w", err))
				}
				buf.WriteRune(rune(rc))
				// Skip the 'U' and its eight hex digits.
				i += 9
				continue
			}
			buf.WriteByte(c)
		default:
			buf.WriteRune(r[i])
		}
		i++
	}

	return buf.String()
}
 | 
