// Copyright ©2020 The Gonum Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:generate ragel -Z -G2 parse.rl //go:generate ragel -Z -G2 extract.rl //go:generate ragel -Z -G2 check.rl //go:generate stringer -type=Kind package rdf import ( "bufio" "bytes" "errors" "fmt" "io" "net/url" "strconv" "strings" "unicode" "unicode/utf8" "gonum.org/v1/gonum/graph" ) var ( _ graph.Node = Term{} _ graph.Edge = (*Statement)(nil) _ graph.Line = (*Statement)(nil) ) var ( ErrInvalid = errors.New("invalid N-Quad") ErrIncomplete = errors.New("incomplete N-Quad") ErrInvalidTerm = errors.New("invalid term") ErrIncompleteTerm = errors.New("incomplete term") ) // Kind represents the kind of an RDF term. type Kind int const ( // Invalid is an invalid RDF term. Invalid Kind = iota // IRI is the kind of an IRI term. // https://www.w3.org/TR/n-quads/#sec-iri IRI // Literal is the kind of an RDF literal. // https://www.w3.org/TR/n-quads/#sec-literals Literal // Blank is the kind of an RDF blank node term. // https://www.w3.org/TR/n-quads/#BNodes Blank ) // Term is an RDF term. It implements the graph.Node interface. type Term struct { // Value is the text value of term. Value string // UID is the unique ID for the term // in a collection of RDF terms. UID int64 } // NewBlankTerm returns a Term based on the provided RDF blank node // label. The label should not include the "_:" prefix. The returned // Term will not have the UID set. func NewBlankTerm(label string) (Term, error) { err := checkLabelText([]rune(label)) if err != nil { return Term{}, err } return Term{Value: "_:" + label}, nil } // NewIRITerm returns a Term based on the provided IRI which must // be valid and include a scheme. The returned Term will not have // the UID set. func NewIRITerm(iri string) (Term, error) { err := checkIRIText(iri) if err != nil { return Term{}, err } return Term{Value: escape("<", iri, ">")}, nil } // NewLiteralTerm returns a Term based on the literal text and an // optional qualifier which may either be a "@"-prefixed language // tag or a valid IRI. The text will be escaped if necessary and quoted, // and if an IRI is given it will be escaped if necessary. The returned // Term will not have the UID set. func NewLiteralTerm(text, qual string) (Term, error) { text = escape(`"`, text, `"`) if qual == "" { return Term{Value: text}, nil } if strings.HasPrefix(qual, "@") { err := checkLangText([]byte(qual)) if err != nil { return Term{}, err } return Term{Value: text + qual}, nil } err := checkIRIText(qual) if err != nil { return Term{}, err } return Term{Value: text + escape("^^<", qual, ">")}, nil } func checkIRIText(iri string) error { switch u, err := url.Parse(iri); { case err != nil: return err case u.Scheme == "": return fmt.Errorf("rdf: %w: relative IRI ref %q", ErrInvalidTerm, iri) default: return nil } } // Parts returns the pars of the term and the kind of the term. // IRI node text is returned as a valid IRI with the quoting angle // brackets removed and escape sequences interpreted, and blank // nodes are stripped of the "_:" prefix. // When the term is a literal, qual will either be empty, an unescaped // IRI, or an RDF language tag prefixed with an @ symbol. The literal // text is returned unquoted and unescaped. func (t Term) Parts() (text, qual string, kind Kind, err error) { return extract([]rune(t.Value)) } // ID returns the value of the Term's UID field. func (t Term) ID() int64 { return t.UID } // Statement is an RDF statement. It implements the graph.Edge and graph.Line // interfaces. type Statement struct { Subject Term Predicate Term Object Term Label Term } // String returns the RDF 1.1 N-Quad formatted statement. func (s *Statement) String() string { if s.Label.Value == "" { return fmt.Sprintf("%s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value) } return fmt.Sprintf("%s %s %s %s .", s.Subject.Value, s.Predicate.Value, s.Object.Value, s.Label.Value) } // From returns the subject of the statement. func (s *Statement) From() graph.Node { return s.Subject } // To returns the object of the statement. func (s *Statement) To() graph.Node { return s.Object } // ID returns the UID of the Predicate field. func (s *Statement) ID() int64 { return s.Predicate.UID } // ReversedEdge returns the receiver unaltered. If there is a semantically // valid edge reversal operation for the data, the user should implement // this by wrapping Statement in a type performing that operation. // See the ReversedLine example for details. func (s *Statement) ReversedEdge() graph.Edge { return s } // ReversedLine returns the receiver unaltered. If there is a semantically // valid line reversal operation for the data, the user should implement // this by wrapping Statement in a type performing that operation. func (s *Statement) ReversedLine() graph.Line { return s } // ParseNQuad parses the statement and returns the corresponding Statement. // All Term UID fields are zero on return. func ParseNQuad(statement string) (*Statement, error) { s, err := parse([]rune(statement)) if err != nil { return nil, err } return &s, err } // Decoder is an RDF stream decoder. Statements returned by calls to the // Unmarshal method have their Terms' UID fields set so that unique terms // will have unique IDs and so can be used directly in a graph.Multi, or // in a graph.Graph if all predicate terms are identical. IDs created by // the decoder all exist within a single namespace and so Terms can be // uniquely identified by their UID. Term UIDs are based from 1 to allow // RDF-aware client graphs to assign ID if no ID has been assigned. type Decoder struct { scanner *bufio.Scanner strings store ids map[string]int64 } // NewDecoder returns a new Decoder that takes input from r. func NewDecoder(r io.Reader) *Decoder { return &Decoder{ scanner: bufio.NewScanner(r), strings: make(store), ids: make(map[string]int64), } } // Reset resets the decoder to use the provided io.Reader, retaining // the existing Term ID mapping. func (dec *Decoder) Reset(r io.Reader) { dec.scanner = bufio.NewScanner(r) dec.strings = make(store) if dec.ids == nil { dec.ids = make(map[string]int64) } } // Unmarshal returns the next statement from the input stream. func (dec *Decoder) Unmarshal() (*Statement, error) { for dec.scanner.Scan() { data := bytes.TrimSpace(dec.scanner.Bytes()) if len(data) == 0 || data[0] == '#' { continue } s, err := ParseNQuad(string(data)) if err != nil { return nil, fmt.Errorf("rdf: failed to parse %q: %w", data, err) } if s == nil { continue } s.Subject.Value = dec.strings.intern(s.Subject.Value) s.Predicate.Value = dec.strings.intern(s.Predicate.Value) s.Object.Value = dec.strings.intern(s.Object.Value) s.Subject.UID = dec.idFor(s.Subject.Value) s.Object.UID = dec.idFor(s.Object.Value) s.Predicate.UID = dec.idFor(s.Predicate.Value) if s.Label.Value != "" { s.Label.Value = dec.strings.intern(s.Label.Value) s.Label.UID = dec.idFor(s.Label.Value) } return s, nil } dec.strings = nil err := dec.scanner.Err() if err != nil { return nil, err } return nil, io.EOF } func (dec *Decoder) idFor(s string) int64 { id, ok := dec.ids[s] if ok { return id } id = int64(len(dec.ids)) + 1 dec.ids[s] = id return id } // Terms returns the mapping between terms and graph node IDs constructed // during decoding the RDF statement stream. func (dec *Decoder) Terms() map[string]int64 { return dec.ids } // store is a string internment implementation. type store map[string]string // intern returns an interned version of the parameter. func (is store) intern(s string) string { if s == "" { return "" } if len(s) < 2 || len(s) > 512 { // Not enough benefit on average with real data. return s } t, ok := is[s] if ok { return t } is[s] = s return s } func escape(lq, s, rq string) string { var buf strings.Builder if lq != "" { buf.WriteString(lq) } for _, r := range s { var c byte switch r { case '\n': c = 'n' case '\r': c = 'r' case '"', '\\': c = byte(r) default: const hex = "0123456789abcdef" switch { case r <= unicode.MaxASCII || strconv.IsPrint(r): buf.WriteRune(r) case r > utf8.MaxRune: r = 0xFFFD fallthrough case r < 0x10000: buf.WriteString("\\u") for s := 12; s >= 0; s -= 4 { buf.WriteByte(hex[r>>uint(s)&0xf]) } default: buf.WriteString("\\U") for s := 28; s >= 0; s -= 4 { buf.WriteByte(hex[r>>uint(s)&0xf]) } } continue } buf.Write([]byte{'\\', c}) } if rq != "" { buf.WriteString(rq) } return buf.String() } func unEscape(r []rune) string { var buf strings.Builder for i := 0; i < len(r); { switch r[i] { case '\\': i++ var c byte switch r[i] { case 't': c = '\t' case 'b': c = '\b' case 'n': c = '\n' case 'r': c = '\r' case 'f': c = '\f' case '"': c = '"' case '\\': c = '\\' case '\'': c = '\'' case 'u': rc, err := strconv.ParseInt(string(r[i+1:i+5]), 16, 32) if err != nil { panic(fmt.Errorf("internal parser error: %w", err)) } buf.WriteRune(rune(rc)) i += 5 continue case 'U': rc, err := strconv.ParseInt(string(r[i+1:i+9]), 16, 32) if err != nil { panic(fmt.Errorf("internal parser error: %w", err)) } buf.WriteRune(rune(rc)) i += 9 continue } buf.WriteByte(c) default: buf.WriteRune(r[i]) } i++ } return buf.String() }