add PathForNode and NodeAtPath functions

This commit is contained in:
Martin Angers
2017-08-12 13:26:49 -04:00
parent 8806ada2a4
commit 96ed0f357a
2 changed files with 141 additions and 0 deletions

View File

@@ -18,6 +18,62 @@ var nodeNames = []string{
html.CommentNode: "#comment",
}
// PathForNode computes the position of n inside its document tree as a
// list of sibling indices, ordered from the top of the tree down to n.
// Each entry is the number of previous siblings the node on that level
// has. A nil node yields a nil path.
func PathForNode(n *html.Node) []int {
	var path []int

	// Walk upwards from n, recording on each level how many siblings
	// precede the current node.
	cur := n
	for cur != nil {
		pos := 0
		sib := cur.PrevSibling
		for sib != nil {
			pos++
			sib = sib.PrevSibling
		}
		path = append(path, pos)
		cur = cur.Parent
	}

	// The indices were collected leaf-first; flip them in place so the
	// path reads root-first.
	for i := 0; i < len(path)/2; i++ {
		j := len(path) - 1 - i
		path[i], path[j] = path[j], path[i]
	}
	return path
}
// NodeAtPath returns the HTML node found by following path in the
// document tree that n belongs to. The path is always resolved from the
// top of the tree, regardless of where n sits in it: the first index
// selects among the top-level siblings, and each following index selects
// among the children of the node selected so far.
//
// It returns nil if n is nil, if any index is negative, or if the path
// runs past the available children or siblings. An empty path returns
// the first top-level node of the tree.
func NodeAtPath(path []int, n *html.Node) *html.Node {
	if n == nil {
		return nil
	}

	// Climb to the top of the tree, then rewind to its first sibling so
	// that index 0 on the first level refers to the first top-level node.
	for n.Parent != nil {
		n = n.Parent
	}
	for n.PrevSibling != nil {
		n = n.PrevSibling
	}

	for depth, ix := range path {
		// A negative index cannot designate any node; without this
		// check it would silently behave like index 0.
		if ix < 0 {
			return nil
		}
		// Every level after the first descends into the children of
		// the node selected on the previous level.
		if depth > 0 {
			n = n.FirstChild
		}
		// Advance ix siblings to the right, stopping early if the
		// level has fewer nodes than requested.
		for ; n != nil && ix > 0; ix-- {
			n = n.NextSibling
		}
		if n == nil {
			return nil
		}
	}
	return n
}
// NodeName returns the node name of the first element in the selection.
// It tries to behave in a similar way as the DOM's nodeName property
// (https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeName).

View File

@@ -9,6 +9,91 @@ import (
"golang.org/x/net/html"
)
// invalidPathNodes pairs HTML inputs with index paths that do not lead
// to any node of the parsed document. TestNodeAtPath expects NodeAtPath
// to return nil for every entry.
var invalidPathNodes = []struct {
	in   string // HTML source to parse
	path []int  // path that cannot be resolved in the parsed tree
}{
	{
		in:   "<a>",
		path: []int{0, 1, 2},
	},
	{
		in:   "<html><head><meta><title></title></head><body><div><p></p><a></a><span></span></div></body></html>",
		path: []int{0, 0, 1, 2, 0},
	},
	{
		in:   "<html><head><meta><title></title></head><body><div><p></p><a></a><span></span></div></body></html>",
		path: []int{1},
	},
	{
		in:   "<html><head><meta><title></title></head><body><div><p></p><a></a><span></span></div></body></html>",
		path: []int{1, 2},
	},
	{
		in:   "<html><head><meta><title></title></head><body><div><p></p><a></a><span></span></div></body></html>",
		path: []int{1, 2, 10},
	},
}
// validPathNodes pairs HTML inputs with the name of one element in the
// parsed document and the index path expected to identify that element.
// The comment above each entry spells out the nodes the path walks
// through; a number in parentheses is a non-zero sibling index.
var validPathNodes = []struct {
	in   string // HTML source to parse
	el   string // name of the element the path should identify
	path []int  // expected path from the top of the tree to el
}{
	// root -> html -> body(1) -> a
	{
		in:   "<a>",
		el:   "a",
		path: []int{0, 0, 1, 0},
	},
	// root -> html -> head -> meta
	{
		in:   "<html><head><meta></head><body></body></html>",
		el:   "meta",
		path: []int{0, 0, 0, 0},
	},
	// root -> html -> head -> title(1)
	{
		in:   "<html><head><meta><title></title></head><body></body></html>",
		el:   "title",
		path: []int{0, 0, 0, 1},
	},
	// root -> html -> body(1) -> div
	{
		in:   "<html><head><meta><title></title></head><body><div><p></p></div></body></html>",
		el:   "div",
		path: []int{0, 0, 1, 0},
	},
	// root -> html -> body(1) -> div -> p
	{
		in:   "<html><head><meta><title></title></head><body><div><p></p></div></body></html>",
		el:   "p",
		path: []int{0, 0, 1, 0, 0},
	},
	// root -> html -> body(1) -> div -> a(1)
	{
		in:   "<html><head><meta><title></title></head><body><div><p></p><a></a><span></span></div></body></html>",
		el:   "a",
		path: []int{0, 0, 1, 0, 1},
	},
	// root -> html -> body(1) -> div -> span(2)
	{
		in:   "<html><head><meta><title></title></head><body><div><p></p><a></a><span></span></div></body></html>",
		el:   "span",
		path: []int{0, 0, 1, 0, 2},
	},
}
// TestPathForNode checks that PathForNode produces the expected index
// path for the first element matching each validPathNodes entry, and
// that a nil node yields a nil path.
func TestPathForNode(t *testing.T) {
	for idx, tc := range validPathNodes {
		doc, err := NewDocumentFromReader(strings.NewReader(tc.in))
		if err != nil {
			t.Errorf("%d: failed to parse: %v", idx, err)
			continue
		}

		// Use the first match, if any. When nothing matches, target
		// stays nil and the path comparison below reports the failure.
		var target *html.Node
		sel := doc.Find(tc.el)
		if sel.Length() > 0 {
			target = sel.Get(0)
		}

		got := PathForNode(target)
		if reflect.DeepEqual(tc.path, got) {
			continue
		}
		// The rendered document is only used to enrich the failure
		// message, so a rendering error is deliberately ignored.
		h, _ := OuterHtml(doc.Selection)
		t.Errorf("%d: want %v, got %v (html: %s)", idx, tc.path, got, h)
	}

	// test a nil node
	got := PathForNode(nil)
	if got != nil {
		t.Errorf("want nil for nil node, got %v", got)
	}
}
// TestNodeAtPath checks that NodeAtPath resolves every path listed in
// validPathNodes to the expected element, and that it returns nil for
// every path in invalidPathNodes and for a nil starting node.
func TestNodeAtPath(t *testing.T) {
	// valid cases
	for i, c := range validPathNodes {
		n, err := html.Parse(strings.NewReader(c.in))
		if err != nil {
			t.Errorf("%d: failed to parse: %v", i, err)
			continue
		}
		nn := NodeAtPath(c.path, n)
		// Report a missing node as a regular failure; dereferencing a
		// nil result below would panic and abort the whole test run.
		if nn == nil {
			t.Errorf("%d: want element %s, got nil", i, c.el)
			continue
		}
		if nn.Data != c.el {
			t.Errorf("%d: want element %s, got %s (%v)", i, c.el, nn.Data, nn)
		}
	}

	// invalid cases
	for i, c := range invalidPathNodes {
		n, err := html.Parse(strings.NewReader(c.in))
		if err != nil {
			t.Errorf("%d: failed to parse: %v", i, err)
			continue
		}
		if got := NodeAtPath(c.path, n); got != nil {
			t.Errorf("%d: want nil, got %v", i, got)
		}
	}

	// test a nil node
	if got := NodeAtPath([]int{1, 2, 3}, nil); got != nil {
		t.Errorf("want nil for nil node, got %v", got)
	}
}
var allNodes = `<!doctype html>
<html>
<head>