diff --git a/pom.xml b/pom.xml index 9acdc5bf..b6d22861 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 org.htmlunit neko-htmlunit - 4.16.0 + 4.17.0-rschwietzke HtmlUnit NekoHtml HtmlUnit @@ -339,4 +339,4 @@ Sean Smith - \ No newline at end of file + diff --git a/src/main/java/org/htmlunit/cyberneko/HTMLConfiguration.java b/src/main/java/org/htmlunit/cyberneko/HTMLConfiguration.java index 277b8bef..ed219e3d 100644 --- a/src/main/java/org/htmlunit/cyberneko/HTMLConfiguration.java +++ b/src/main/java/org/htmlunit/cyberneko/HTMLConfiguration.java @@ -126,6 +126,7 @@ public class HTMLConfiguration extends ParserConfigurationSettings implements XM /** Namespace binder. */ private final NamespaceBinder namespaceBinder_ = new NamespaceBinder(this); + /** HTML elements to be used */ private final HTMLElements htmlElements_; /** Default constructor. */ diff --git a/src/main/java/org/htmlunit/cyberneko/HTMLElements.java b/src/main/java/org/htmlunit/cyberneko/HTMLElements.java index b03b36be..3d1b0880 100644 --- a/src/main/java/org/htmlunit/cyberneko/HTMLElements.java +++ b/src/main/java/org/htmlunit/cyberneko/HTMLElements.java @@ -15,18 +15,21 @@ */ package org.htmlunit.cyberneko; -import java.util.HashMap; import java.util.Locale; import org.htmlunit.cyberneko.util.FastHashMap; /** - * Collection of HTML element information. + * Collection of HTML element information. Parts if it was in the original + * HtmlElements, now HtmlElementsCollection. It has been changed to allow + * modifications at runtime in the sense of caching any lookup results, + * especially for not found elements to speed things up. * * @author Andy Clark * @author Ahmed Ashour * @author Marc Guillemot * @author Ronald Brill + * @author Rene Schwietzke */ public class HTMLElements { @@ -35,6 +38,8 @@ public class HTMLElements { // NOTE: The element codes *must* start with 0 and increment in // sequence. The parent and closes references depends on // this assumption. -Ac + // Note 2: This codes are here to maintain compatibility with + // existing code despite having the main use in HtmlElementsCollection. public static final short A = 0; public static final short ABBR = A + 1; @@ -183,26 +188,43 @@ public class HTMLElements { public static final short XML = WBR + 1; public static final short XMP = XML + 1; public static final short UNKNOWN = XMP + 1; - - // information - - /** No such element. */ - public final Element NO_SUCH_ELEMENT = new Element(UNKNOWN, "", Element.CONTAINER, new short[]{BODY}, null); - - // these fields became private to avoid exposing them for indirect modification - // this cannot be final because HtmlUnit might add to that - private Element[] elementsByCode_; - - // keep the list here for later modification - private final HashMap elementsByNameForReference_ = new HashMap<>(); - - // this is a optimized version which will be later queried - private final FastHashMap elementsByNameOptimized_ = new FastHashMap<>(311, 0.70f); + + // holds the original elements to query against + private final HTMLElementsCollection htmlElementsCollection_; // this map helps us to know what elements we don't have and speed things up - private final FastHashMap unknownElements_ = new FastHashMap<>(11, 0.70f); + private final FastHashMap unknownElements_ = new FastHashMap<>(17, 0.50f); + /** + * Create a new lookup instance based on the default elements. This instance + * features a cache for not found elements to speed things up. + * + * @param htmlElements the elements to lookup against + */ public HTMLElements() { + this.htmlElementsCollection_ = HTMLElementsCollection.DEFAULT; + } + + /** + * Create a new lookup instance based on the given elements. This instance + * features a cache for not found elements to speed things up. + * + * @param htmlElements the elements to lookup against + */ + public HTMLElements(final HTMLElementsCollection htmlElements) { + this.htmlElementsCollection_ = htmlElements; + } + + /** + * Creates the default setup of HTML elements. We cannot statically set + * them up because some elements refer to others and these references + * might be changed later when adding custom elements. + * + * This is not for public use. + * + * @return the array of arrays of default elements + */ + static Element[][] setupDefaultHTMElements() { final Element[][] elementsArray = new Element[26][]; // // @@ -550,102 +572,65 @@ public HTMLElements() { // XMP new Element(XMP, "XMP", Element.SPECIAL, BODY, new short[] {P}), }; - - // keep contiguous list of elements for lookups by code - for (final Element[] elements : elementsArray) { - if (elements != null) { - for (final Element element : elements) { - this.elementsByNameForReference_.put(element.name, element); - } - } - } - - // setup optimized versions - setupOptimizedVersions(); - } - - public void setElement(final Element element) { - this.elementsByNameForReference_.put(element.name, element); - - // rebuild the information "trees" - setupOptimizedVersions(); - } - - private void setupOptimizedVersions() { - // we got x amount of elements + 1 unknown - // put that into an array instead of a map, that - // is a faster look up and avoids equals - // ATTENTION: Due to some HtmlUnit custom tag handling that overwrites our - // list here, we might get a list with holes, so check the range first - final int size = elementsByNameForReference_.values().stream().mapToInt(v -> v.code).max().getAsInt(); - elementsByCode_ = new Element[Math.max(size, NO_SUCH_ELEMENT.code) + 1]; - elementsByNameForReference_.values().forEach(v -> elementsByCode_[v.code] = v); - elementsByCode_[NO_SUCH_ELEMENT.code] = NO_SUCH_ELEMENT; - - // initialize cross references to parent elements - for (final Element element : elementsByCode_) { - if (element != null) { - defineParents(element); - } - } - // get us a second version that is lowercase stringified to - // reduce lookup overhead - for (final Element element : elementsByCode_) { - // we might have holes due to HtmlUnitNekoHtmlParser - if (element != null) { - elementsByNameOptimized_.put(element.name.toLowerCase(Locale.ROOT), element); - } - } - } - - private void defineParents(final Element element) { - if (element.parentCodes_ != null) { - element.parent = new Element[element.parentCodes_.length]; - for (int j = 0; j < element.parentCodes_.length; j++) { - element.parent[j] = elementsByCode_[element.parentCodes_[j]]; - } - element.parentCodes_ = null; - } + + return elementsArray; } /** - * @return the element information for the specified element code. + * Lookup table for elements by code. There is not range check applied + * for the sake of performance. Java will check that anyway, so we + * don't have to. * * @param code The element code. + * @return the element information for the specified element code. */ public final Element getElement(final short code) { - return elementsByCode_[code]; + return this.htmlElementsCollection_.getElement(code); } /** - * @return the element information for the specified element name. - * + * Finds an element by its name and returns the element information. If + * the element is not known, a new element with the UNKNOWN code is + * returned. + * * @param ename The element name. + * @return the element information for the specified element name or an new instance with the + * UNKNOWN code if not found. */ public final Element getElement(final String ename) { - Element element = getElement(ename, NO_SUCH_ELEMENT); - if (element == NO_SUCH_ELEMENT) { - element = new Element(UNKNOWN, ename.toUpperCase(Locale.ROOT), - NO_SUCH_ELEMENT.flags, NO_SUCH_ELEMENT.parentCodes_, NO_SUCH_ELEMENT.closes); - element.parent = NO_SUCH_ELEMENT.parent; - element.parentCodes_ = NO_SUCH_ELEMENT.parentCodes_; + Element element = getElement(ename, this.htmlElementsCollection_.NO_SUCH_ELEMENT); + + if (element == this.htmlElementsCollection_.NO_SUCH_ELEMENT) { + element = new Element(UNKNOWN, + ename.toUpperCase(Locale.ROOT), + this.htmlElementsCollection_.NO_SUCH_ELEMENT.flags, + this.htmlElementsCollection_.NO_SUCH_ELEMENT.parentCodes_, + this.htmlElementsCollection_.NO_SUCH_ELEMENT.closes); + element.parent = this.htmlElementsCollection_.NO_SUCH_ELEMENT.parent; + element.parentCodes_ = this.htmlElementsCollection_.NO_SUCH_ELEMENT.parentCodes_; } + return element; } - /** - * @return the element information for the specified element name. - * + * Looks up an element by its name and returns the element information. If the + * current form of the name is not found, it tries again with the lowercase form. + * If still not found, the given default element is returned. + * * @param ename The element name. * @param element The default element to return if not found. + * + * @return the element information for the specified element name. */ public final Element getElement(final String ename, final Element element) { // check the current form casing first, which is mostly lowercase only - Element r = elementsByNameOptimized_.get(ename); + Element r = this.htmlElementsCollection_.lookupElement(ename); + + // we have not found it, so it might feature different casing if (r == null) { // check first if we know that we don't know and avoid the // lowercasing later - if (unknownElements_.get(ename) != null) { + if (this.unknownElements_.get(ename) != null) { // we added it to the cache, so we know it has been // queried once unsuccessfully before return element; @@ -656,14 +641,15 @@ public final Element getElement(final String ename, final Element element) { // good HTML is mostly all lowercase in the first place so this is the // fallback for atypical HTML // we also have not seen that element missing yet - r = elementsByNameOptimized_.get(ename.toLowerCase(Locale.ROOT)); + r = this.htmlElementsCollection_.lookupElement(ename.toLowerCase(Locale.ROOT)); // remember that we had a miss if (r == null) { - unknownElements_.put(ename, Boolean.TRUE); + this.unknownElements_.put(ename, Boolean.TRUE); return element; } } + return r; } @@ -698,6 +684,12 @@ public static class Element { /** The element code. */ public final short code; + /** + * hash code base on the name, kept here to avoid one extra memory + * access as well as the extra logic in String.hashCode() + */ + private final int hashCode_; + /** The element name. */ public final String name; @@ -707,7 +699,10 @@ public static class Element { /** Informational flags. */ public final int flags; - /** Parent elements. */ + /** + * Parent elements, having that open is dangerous but for + * legacy reasons we won't change that right now + */ public Element[] parent; /** The bounding element code. */ @@ -718,7 +713,7 @@ public static class Element { /** Parent elements. */ short[] parentCodes_; - + /** * Constructs an element object. * @@ -782,6 +777,9 @@ public Element(final short code, final String name, final int flags, this.parent = null; this.bounds = bounds; this.closes = closes; + // the name is never null, if so, the lowercase would have blown + // up already + this.hashCode_ = name.hashCode(); } /** @@ -844,11 +842,11 @@ public boolean closes(final short tag) { } /** - * @return a hash code for this object. + * @return the hash code for this object. */ @Override public int hashCode() { - return name.hashCode(); + return this.hashCode_; } /** @@ -858,7 +856,7 @@ public int hashCode() { public boolean equals(final Object o) { if (o instanceof Element) { final Element e = (Element) o; - return lowercaseName.equals(e.name) || name.equals(e.name); + return name.equals(e.name) || lowercaseName.equals(e.name); } return false; } diff --git a/src/main/java/org/htmlunit/cyberneko/HTMLElementsCollection.java b/src/main/java/org/htmlunit/cyberneko/HTMLElementsCollection.java new file mode 100644 index 00000000..247a1ae7 --- /dev/null +++ b/src/main/java/org/htmlunit/cyberneko/HTMLElementsCollection.java @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2002-2009 Andy Clark, Marc Guillemot + * Copyright (c) 2017-2024 Ronald Brill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.htmlunit.cyberneko; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; + +import org.htmlunit.cyberneko.HTMLElements.Element; +import org.htmlunit.cyberneko.util.FastHashMap; + +/** + * Collection of HTML element information. This is an immutable object + * so it is safe to share between multiple threads. But one must not + * continue to use any custom elements after adding it to this collection. + * + * This class was originally in parts HtmlElements. To allow reuse and safe + * use in a concurrent environment, it was split into this class and the new + * HtmlElements. In parts to make the API less breaking. + * + * @author Andy Clark + * @author Ahmed Ashour + * @author Marc Guillemot + * @author Ronald Brill + * @author Rene Schwietzke + */ +public class HTMLElementsCollection { + + /** A default reusable instance */ + public static final HTMLElementsCollection DEFAULT = new HTMLElementsCollection(); + + /** No such element. */ + public final Element NO_SUCH_ELEMENT = new Element(HTMLElements.UNKNOWN, "", Element.CONTAINER, new short[]{HTMLElements.BODY}, null); + + // information + + // these fields became private to avoid exposing them for indirect modification + // It cannot be final because we know only later the needed size. + private Element[] elementsByCode_; + + // keep the list here for later modification + private final HashMap elementsByNameForReference_ = new HashMap<>(); + + // this is a optimized version which will be later queried, sparsely populated to avoid too many collisions + private final FastHashMap elementsByNameOptimized_ = new FastHashMap<>(311, 0.50f); + + /** + * Creates a new HTMLElements with all default objects only. If you need that, + * use the static {@link #DEFAULT} instance. + */ + private HTMLElementsCollection() { + this(Collections.emptyList()); + } + + /** + * Creates a new HTMLElements with one extra custom element. + * + * @param customElement our custom element to add + */ + public HTMLElementsCollection(Element customElement) { + this(Collections.singletonList(customElement)); + } + + /** + * Creates a new HTMLElements with all default objects plus our custom ones. + */ + public HTMLElementsCollection(List customElements) { + final Element[][] elementsArray = HTMLElements.setupDefaultHTMElements(); + + // keep contiguous list of elements for lookups by name + for (final Element[] elements : elementsArray) { + if (elements != null) { + for (final Element element : elements) { + this.elementsByNameForReference_.put(element.name, element); + } + } + } + // add our custom elements + for (final Element customElement : customElements) { + this.elementsByNameForReference_.put(customElement.name, customElement); + } + + // setup an optimized versions with all references to parents and + // some optimized lookup structures + setupOptimizedVersions(); + } + + private void setupOptimizedVersions() { + // we got x amount of elements + 1 unknown + // put that into an array instead of a map, that + // is a faster look up and avoids equals + + // ATTENTION: Due to some HtmlUnit custom tag handling that overwrites our + // list here, we might get a list with holes, so check the range first + final int size = elementsByNameForReference_.values().stream().mapToInt(v -> v.code).max().getAsInt(); + + elementsByCode_ = new Element[Math.max(size, NO_SUCH_ELEMENT.code) + 1]; + elementsByNameForReference_.values().forEach(v -> elementsByCode_[v.code] = v); + elementsByCode_[NO_SUCH_ELEMENT.code] = NO_SUCH_ELEMENT; + + // initialize cross references to parent elements + for (final Element element : elementsByCode_) { + if (element != null) { + defineParents(element); + } + } + + // get us a second version that is lowercase to + // reduce lookup overhead + for (final Element element : elementsByCode_) { + // we might have holes due to HtmlUnitNekoHtmlParser + if (element != null) { + elementsByNameOptimized_.put(element.name.toLowerCase(Locale.ROOT), element); + } + } + } + + private void defineParents(final Element element) { + if (element.parentCodes_ != null) { + element.parent = new Element[element.parentCodes_.length]; + + for (int j = 0; j < element.parentCodes_.length; j++) { + element.parent[j] = elementsByCode_[element.parentCodes_[j]]; + } + + element.parentCodes_ = null; + } + } + + /** + * Lookup table for elements by code. There is no range check applied + * for the sake of performance. Java will check that anyway, so we + * don't have to. + * + * @param code The element code. + * @return the element information for the specified element code. + */ + final Element getElement(final short code) { + return elementsByCode_[code]; + } + + /** + * Lookup the element by name, returns null if not found. + * + * @param ename the name of the element to lookup + * @return the element or null if not found + */ + final Element lookupElement(final String ename) { + return elementsByNameOptimized_.get(ename); + } +}