diff --git a/src/HtmlAgilityPack.Shared/HtmlDocument.cs b/src/HtmlAgilityPack.Shared/HtmlDocument.cs index 2871d40..37fa647 100644 --- a/src/HtmlAgilityPack.Shared/HtmlDocument.cs +++ b/src/HtmlAgilityPack.Shared/HtmlDocument.cs @@ -194,11 +194,18 @@ public static bool DisableBehaviorTagP /// public bool OptionWriteEmptyNodes; - #endregion + /// + /// The maximum allowed nesting depth of child nodes. A value of 0 (the default) disables the check. + /// Added to prevent a stack overflow when a page has tens of thousands of opening html tags with no closing tags. + /// + public int OptionMaxNestedChildNodes = 0; + + + #endregion - #region Static Members + #region Static Members - internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node"; + internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node"; internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature"; @@ -1093,7 +1100,7 @@ private string CurrentNodeName() private void DecrementPosition() { _index--; - if (_lineposition == 1) + if (_lineposition == 0) { _lineposition = _maxlineposition; _line--; @@ -1190,7 +1197,7 @@ private void IncrementPosition() _maxlineposition = _lineposition; if (_c == 10) { - _lineposition = 1; + _lineposition = 0; _line++; } else @@ -1228,7 +1235,7 @@ private bool NewCheck() break; case ParseState.BetweenAttributes: - PushAttributeNameStart(_index - 1); + PushAttributeNameStart(_index - 1, _lineposition -1); break; case ParseState.WhichTag: @@ -1255,7 +1262,7 @@ private bool NewCheck() { if (Text[_index] == '!') { - PushNodeStart(HtmlNodeType.Comment, _index - 1); + PushNodeStart(HtmlNodeType.Comment, _index - 1, _lineposition -1); PushNodeNameStart(true, _index); PushNodeNameEnd(_index + 1); _state = ParseState.Comment; @@ -1276,7 +1283,7 @@ private bool NewCheck() } } - PushNodeStart(HtmlNodeType.Element, _index - 1); + PushNodeStart(HtmlNodeType.Element, _index - 1,
_lineposition - 1); return true; } @@ -1298,8 +1305,8 @@ private void Parse() _fullcomment = false; _parseerrors = new List(); _line = 1; - _lineposition = 1; - _maxlineposition = 1; + _lineposition = 0; + _maxlineposition = 0; _state = ParseState.Text; _oldstate = _state; @@ -1312,7 +1319,7 @@ private void Parse() _currentattribute = null; _index = 0; - PushNodeStart(HtmlNodeType.Text, 0); + PushNodeStart(HtmlNodeType.Text, 0, _lineposition); while (_index < Text.Length) { _c = Text[_index]; @@ -1391,7 +1398,7 @@ private void Parse() if (_state != ParseState.Tag) continue; _state = ParseState.Text; - PushNodeStart(HtmlNodeType.Text, _index); + PushNodeStart(HtmlNodeType.Text, _index, _lineposition); } break; @@ -1421,11 +1428,11 @@ private void Parse() if (_state != ParseState.BetweenAttributes) continue; _state = ParseState.Text; - PushNodeStart(HtmlNodeType.Text, _index); + PushNodeStart(HtmlNodeType.Text, _index, _lineposition); continue; } - PushAttributeNameStart(_index - 1); + PushAttributeNameStart(_index - 1, _lineposition -1); _state = ParseState.AttributeName; break; @@ -1445,7 +1452,7 @@ private void Parse() if (_state != ParseState.EmptyTag) continue; _state = ParseState.Text; - PushNodeStart(HtmlNodeType.Text, _index); + PushNodeStart(HtmlNodeType.Text, _index, _lineposition); continue; } @@ -1496,7 +1503,7 @@ private void Parse() if (_state != ParseState.AttributeName) continue; _state = ParseState.Text; - PushNodeStart(HtmlNodeType.Text, _index); + PushNodeStart(HtmlNodeType.Text, _index, _lineposition); continue; } @@ -1520,7 +1527,7 @@ private void Parse() if (_state != ParseState.AttributeBeforeEquals) continue; _state = ParseState.Text; - PushNodeStart(HtmlNodeType.Text, _index); + PushNodeStart(HtmlNodeType.Text, _index, _lineposition); continue; } @@ -1562,7 +1569,7 @@ private void Parse() if (_state != ParseState.AttributeAfterEquals) continue; _state = ParseState.Text; - PushNodeStart(HtmlNodeType.Text, _index); + 
PushNodeStart(HtmlNodeType.Text, _index, _lineposition); continue; } @@ -1594,7 +1601,7 @@ private void Parse() if (_state != ParseState.AttributeValue) continue; _state = ParseState.Text; - PushNodeStart(HtmlNodeType.Text, _index); + PushNodeStart(HtmlNodeType.Text, _index, _lineposition); continue; } @@ -1645,7 +1652,7 @@ private void Parse() } _state = ParseState.Text; - PushNodeStart(HtmlNodeType.Text, _index); + PushNodeStart(HtmlNodeType.Text, _index, _lineposition); continue; } @@ -1706,11 +1713,11 @@ private void Parse() script._outerlength = _index - 1 - script._outerstartindex; script._streamposition = script._outerstartindex; script._line = _currentnode.Line; - script._lineposition = _currentnode.LinePosition + _currentnode._namelength + 2; - _currentnode.AppendChild(script); + script._lineposition = _currentnode.LinePosition + _currentnode._namelength + 2; + _currentnode.AppendChild(script); - PushNodeStart(HtmlNodeType.Element, _index - 1); + PushNodeStart(HtmlNodeType.Element, _index - 1, _lineposition -1); PushNodeNameStart(false, _index - 1 + 2); _state = ParseState.Tag; IncrementPosition(); @@ -1743,12 +1750,12 @@ private void PushAttributeNameEnd(int index) _currentnode.Attributes.Append(_currentattribute); } - private void PushAttributeNameStart(int index) + private void PushAttributeNameStart(int index, int lineposition) { _currentattribute = CreateAttribute(); _currentattribute._namestartindex = index; _currentattribute.Line = _line; - _currentattribute._lineposition = _lineposition; + _currentattribute._lineposition = lineposition; _currentattribute._streamposition = index; } @@ -2056,16 +2063,11 @@ private void PushNodeNameStart(bool starttag, int index) _currentnode._namestartindex = index; } - private void PushNodeStart(HtmlNodeType type, int index) + private void PushNodeStart(HtmlNodeType type, int index, int lineposition) { _currentnode = CreateNode(type, index); _currentnode._line = _line; - _currentnode._lineposition = _lineposition; - 
if (type == HtmlNodeType.Element) - { - _currentnode._lineposition--; - } - + _currentnode._lineposition = lineposition; _currentnode._streamposition = index; } diff --git a/src/HtmlAgilityPack.Shared/HtmlNode.cs b/src/HtmlAgilityPack.Shared/HtmlNode.cs index cfceff1..5be7971 100644 --- a/src/HtmlAgilityPack.Shared/HtmlNode.cs +++ b/src/HtmlAgilityPack.Shared/HtmlNode.cs @@ -618,6 +618,12 @@ public string XPath } } + + /// + /// The depth of the node relative to the opening root html element. This value is used to determine if a document has too many nested html nodes, which can cause stack overflows + /// + public int Depth { get; set; } + #endregion #region Public Methods @@ -1859,6 +1865,23 @@ public string WriteTo() } } + /// + /// Sets the parent Html node and properly determines the current node's depth using the parent node's depth. + /// + public void SetParent(HtmlNode parent) + { + if (parent == null) + return; + + ParentNode = parent; + if (OwnerDocument.OptionMaxNestedChildNodes > 0) + { + Depth = parent.Depth + 1; + if (Depth > OwnerDocument.OptionMaxNestedChildNodes) + throw new Exception(string.Format("Document has more than {0} nested tags.
This is likely due to the page not closing tags properly.", OwnerDocument.OptionMaxNestedChildNodes)); + } + } + #endregion #region Internal Methods diff --git a/src/HtmlAgilityPack.Shared/HtmlNodeCollection.cs b/src/HtmlAgilityPack.Shared/HtmlNodeCollection.cs index ca194ed..43df9a2 100644 --- a/src/HtmlAgilityPack.Shared/HtmlNodeCollection.cs +++ b/src/HtmlAgilityPack.Shared/HtmlNodeCollection.cs @@ -220,9 +220,9 @@ public void Insert(int index, HtmlNode node) if (next == node) throw new InvalidProgramException("Unexpected error."); - node._nextnode = next; - node._parentnode = _parentnode; - } + node._nextnode = next; + node.SetParent(_parentnode); + } /// /// Remove node @@ -315,8 +315,8 @@ public void Append(HtmlNode node) _items.Add(node); node._prevnode = last; node._nextnode = null; - node._parentnode = _parentnode; - if (last == null) return; + node.SetParent(_parentnode); + if (last == null) return; if (last == node) throw new InvalidProgramException("Unexpected error."); @@ -363,9 +363,9 @@ public void Prepend(HtmlNode node) throw new InvalidProgramException("Unexpected error."); node._nextnode = first; node._prevnode = null; - node._parentnode = _parentnode; + node.SetParent(_parentnode); - if (first != null) + if (first != null) first._prevnode = node; } @@ -415,9 +415,9 @@ public void Replace(int index, HtmlNode node) throw new InvalidProgramException("Unexpected error."); node._nextnode = next; - node._parentnode = _parentnode; + node.SetParent(_parentnode); - oldnode._prevnode = null; + oldnode._prevnode = null; oldnode._nextnode = null; oldnode._parentnode = null; }