diff --git a/utilities.go b/utilities.go
index b4c061a..5ff58bd 100644
--- a/utilities.go
+++ b/utilities.go
@@ -18,6 +18,75 @@ var nodeNames = []string{
 	html.CommentNode: "#comment",
 }
 
+// PathForNode returns a unique path to retrieve the specified node
+// from its document tree. The path is a slice of int indices, starting
+// at the root of the tree: each index is the number of previous siblings
+// of the node found at that depth. A nil node yields a nil path.
+func PathForNode(n *html.Node) []int {
+	var indices []int
+	// walk up from the node to the root, recording each sibling position
+	for cur := n; cur != nil; cur = cur.Parent {
+		ix := 0
+		for prev := cur.PrevSibling; prev != nil; prev = prev.PrevSibling {
+			ix++
+		}
+		indices = append(indices, ix)
+	}
+
+	// the indices were collected leaf-first: reverse the slice so that
+	// the path starts at the root
+	for l, r := 0, len(indices)-1; l < r; l, r = l+1, r-1 {
+		indices[l], indices[r] = indices[r], indices[l]
+	}
+	return indices
+}
+
+// NodeAtPath returns the HTML node at the specified path in the
+// document tree of the specified n node. The path is followed from
+// the root of the tree, whichever node of that tree n is. If no node
+// is found by following the path, or if the path holds a negative
+// index, nil is returned. An empty path returns the node at the root.
+func NodeAtPath(path []int, n *html.Node) *html.Node {
+	if n == nil {
+		return nil
+	}
+
+	// start at root: climb to the topmost ancestor, then rewind to its
+	// first sibling, the node PathForNode counts as index 0
+	for n.Parent != nil {
+		n = n.Parent
+	}
+	for n.PrevSibling != nil {
+		n = n.PrevSibling
+	}
+
+	for i, ix := range path {
+		// a negative index cannot designate a node; unchecked, the sibling
+		// loop below would not run and the index would act like 0
+		if ix < 0 {
+			return nil
+		}
+
+		// every step after the first goes one level down
+		if i > 0 {
+			n = n.FirstChild
+			if n == nil {
+				return nil
+			}
+		}
+
+		// then ix siblings to the right
+		for j := 0; j < ix; j++ {
+			n = n.NextSibling
+			if n == nil {
+				return nil
+			}
+		}
+	}
+
+	return n
+}
+
 // NodeName returns the node name of the first element in the selection.
 // It tries to behave in a similar way as the DOM's nodeName property
 // (https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeName).
diff --git a/utilities_test.go b/utilities_test.go
index c8e9d54..733abab 100644
--- a/utilities_test.go
+++ b/utilities_test.go
@@ -9,6 +9,114 @@ import (
 	"golang.org/x/net/html"
 )
 
+// invalidPathNodes lists documents together with paths for which
+// NodeAtPath is expected to return nil.
+//
+// NOTE(review): the "in" literals of both tables look like they lost
+// their HTML markup in this copy of the patch (the valid cases look up
+// elements such as "a" or "div" in them) - restore them from the
+// original change before applying.
+var invalidPathNodes = []struct {
+	in   string
+	path []int
+}{
+	{"", []int{0, 1, 2}},
+	{"", []int{0, -1}}, // a negative index designates no node
+	{"\n\n", []int{0, 0, 1, 2, 0}},
+	{"\n\n", []int{1}},
+	{"\n\n", []int{1, 2}},
+	{"\n\n", []int{1, 2, 10}},
+}
+
+// validPathNodes lists documents, the name of an element to look up in
+// each of them, and the path expected for that element.
+var validPathNodes = []struct {
+	in   string
+	el   string
+	path []int
+}{
+	{"", "a", []int{0, 0, 1, 0}},           // root html body(1) a
+	{"", "meta", []int{0, 0, 0, 0}},        // root html head meta
+	{"", "title", []int{0, 0, 0, 1}},       // root html head title
+	{"\n\n", "div", []int{0, 0, 1, 0}},     // root html body(1) div
+	{"\n\n", "p", []int{0, 0, 1, 0, 0}},    // root html body(1) div p
+	{"\n\n", "a", []int{0, 0, 1, 0, 1}},    // root html body(1) div a(1)
+	{"\n\n", "span", []int{0, 0, 1, 0, 2}}, // root html body(1) div span(2)
+}
+
+// TestPathForNode checks the path computed for the first element
+// matching each valid case, and that a nil node yields a nil path.
+func TestPathForNode(t *testing.T) {
+	for i, c := range validPathNodes {
+		doc, err := NewDocumentFromReader(strings.NewReader(c.in))
+		if err != nil {
+			t.Errorf("%d: failed to parse: %v", i, err)
+			continue
+		}
+
+		// n stays nil when the element is missing, which fails the
+		// comparison below
+		var n *html.Node
+		if sel := doc.Find(c.el); sel.Length() > 0 {
+			n = sel.Get(0)
+		}
+
+		got := PathForNode(n)
+		if !reflect.DeepEqual(c.path, got) {
+			// h is only used in the failure message, its error is not checked
+			h, _ := OuterHtml(doc.Selection)
+			t.Errorf("%d: want %v, got %v (html: %s)", i, c.path, got, h)
+		}
+	}
+
+	// test a nil node
+	if got := PathForNode(nil); got != nil {
+		t.Errorf("want nil for nil node, got %v", got)
+	}
+}
+
+// TestNodeAtPath checks that valid paths lead to the expected element
+// and that invalid paths, or a nil node, yield nil.
+func TestNodeAtPath(t *testing.T) {
+	// valid cases
+	for i, c := range validPathNodes {
+		n, err := html.Parse(strings.NewReader(c.in))
+		if err != nil {
+			t.Errorf("%d: failed to parse: %v", i, err)
+			continue
+		}
+
+		nn := NodeAtPath(c.path, n)
+		if nn == nil {
+			// report the failure instead of dereferencing nil below,
+			// which would panic and abort the remaining cases
+			t.Errorf("%d: want element %s, got nil", i, c.el)
+			continue
+		}
+		if nn.Data != c.el {
+			t.Errorf("%d: want element %s, got %s (%v)", i, c.el, nn.Data, nn)
+		}
+	}
+
+	// invalid cases
+	for i, c := range invalidPathNodes {
+		n, err := html.Parse(strings.NewReader(c.in))
+		if err != nil {
+			t.Errorf("%d: failed to parse: %v", i, err)
+			continue
+		}
+
+		if got := NodeAtPath(c.path, n); got != nil {
+			t.Errorf("%d: want nil, got %v", i, got)
+		}
+	}
+
+	// test a nil node
+	if got := NodeAtPath([]int{1, 2, 3}, nil); got != nil {
+		t.Errorf("want nil for nil node, got %v", got)
+	}
+}
+
 var allNodes = `