Regular Expressions Difficulty

Collapse
This topic is closed.
X
X
 
  • Time
  • Show
Clear All
new posts
  • Befuddled

    Regular Expressions Difficulty


    I am writing a function to have its argument, HTML-containing string,
    return a DOM 1 Document Fragment, and so it seems the use of regular
    expressions (REs) is a natural.

    My problem is that the browsers (IE and Mozilla) that I am using to write
    and debug have a different idea about parsing strings using REs. Here is
    the starting example:

    stringPtr = "<div id=\"errblock\" style=\"color:red;\">" +
    "<p>This is a simple doc frag";
    elem = stringPtr.match (/<(.+)>/);

    This is only the LATEST in SEVERAL different revisions of the RE for
    'elem'. What the debugger (Venkman, but IE does the same) keeps returning
    in elem[1], the variable of interest, is a string that includes the DIV and
    the P element. I made a table of all the REs I have tried and their
    results:

    RE: /\<(.+)>\)/
    elem[0]: "<div id=\"something\ " style=\"color:r ed;\"><p>"
    elem[1]: "div id=\"something\ " style=\"color:r ed;\"><p"

    RE: /(\<.+\>)/
    elem[0]: "<div id=\"something\ " style=\"color:r ed;\"><p>"
    elem[1]: "<div id=\"something\ " style=\"color:r ed;\"><p>"

    RE: /<(\w+)>/
    elem[0]: "<p>"
    elem[1]: "p"

    RE: /<(\S+\s*\S*)>/
    elem[0]: "<p>"
    elem[1]: "p"


    All of these seem wrong to me. So long as what occurs between the '<' and
    '>' matches the criteria, the parser should return JUST the first element
    (the DIV) and look for elements that may or may not contain
    attributes, depending upon the RE within the parenthesized subexpression of
    the RE.

    The problem is that it is matching on the P element, ignoring the '><'
    that occurs in between. It should not matter whether whitespace precedes
    the P element, since it is not required and browsers can make sense of it.

    My intention is to have an RE that recognizes elements with and without
    attributes, and also to deal with container text as well.






    //============== contents of dom1.js ============

    /* Note, at least half of the lines in the code are UNTESTED and
    almost certainly RIDDLED WITH ERROR and EXCEPTION, and
    so the code is likely to change, and especially to make use of
    optimizations to get around slow performance */

    /* Element-name tables used by verifyElem(). All names are lower case
       and must not contain whitespace, since they are compared with ==. */

    /* Empty elements: the end tag is forbidden, so they never hold children.
       "area" belongs here (HTML 4.01 declares it EMPTY), not in the
       required-end-tag table. */
    var nonEtagoElements = [ "input", "br", "img", "hr", "col", "frame",
        "meta", "link", "param", "base", "basefont", "area" ];

    /* Elements whose end tag is required, bucketed by first letter so that
       verifyElem() only has to scan a short list. */
    var RequiredEtagoElements = {
        a: [ "a", "applet", "address", "abbr", "acronym" ],
        b: [ "b", "body", "blockquote", "big", "bdo" ],
        c: [ "center", "caption", "cite", "code" ],
        d: [ "div", "dfn", "dl", "del", "dir" ],
        e: [ "em" ],
        f: [ "form", "font", "fieldset" ],
        /* NOTE(review): "inindex" is not an HTML element; it may have been
           meant as "isindex" (which is an empty element) - confirm. */
        i: [ "i", "iframe", "ins", "inindex" ],
        k: [ "kbd" ],
        l: [ "label", "legend" ],
        m: [ "map", "menu" ],
        n: [ "noscript", "noframes" ],
        o: [ "ol", "optgroup", "object" ],
        p: [ "pre" ],
        q: [ "q" ],
        s: [ "span", "strong", "sub", "sup", "script", "select", "style",
             "small", "samp", "strike", "s" ],
        t: [ "table", "title", "tt" ],
        u: [ "ul", "u" ],
        v: [ "var" ]
    };

    /* Elements that hold children but whose end tag may be omitted. */
    var OptionalEtagoElements = [ "p", "tr", "td", "th", "li",
        "colgroup", "option", "dd", "dt", "thead", "tfoot" ];

    /* Elements whose start and end tags may both be omitted. */
    var ImpliedElements = [ "tbody", "head", "html" ];

    /**
     * Tells whether a tag name is one of the HTML elements listed in the
     * element tables (RequiredEtagoElements, OptionalEtagoElements,
     * ImpliedElements and, unless excluded, nonEtagoElements).
     *
     * The name is compared case-insensitively against every table.
     *
     * @param elemStr  tag name to look up; converted to a string first
     * @param option   pass 1 to leave out the empty elements
     *                 (nonEtagoElements), i.e. to ask "can this element hold
     *                 children?"; any other value (or omitted) checks all
     *                 four tables
     * @return true when the name is found in a table that was consulted,
     *         false otherwise
     */
    function verifyElem(elemStr, option)
    {
        var name = String(elemStr).toLowerCase();

        /* Linear scan of one table; a missing table counts as "not found". */
        function listed(names)
        {
            var i;
            if (!names)
                return (false);
            for (i = 0; i < names.length; i++)
                if (names[i] == name)
                    return (true);
            return (false);
        }

        /* The bucket is undefined for first letters that have no entry
           (e.g. "h", "g", "j"), so it must not be dereferenced blindly. */
        if (listed(RequiredEtagoElements[name.charAt(0)]))
            return (true);
        if (listed(OptionalEtagoElements) || listed(ImpliedElements))
            return (true);
        if (option == 1)
            return (false);
        return (listed(nonEtagoElements));
    }

    /**
     * Tells whether an element can hold child nodes, by asking verifyElem()
     * to look the name up with option 1 (which leaves out the empty
     * elements).
     *
     * @param elemStr  tag name to look up
     * @return whatever verifyElem(elemStr, 1) returns
     */
    function isContainer(elemStr)
    {
        return (verifyElem(elemStr, 1));
    }

    /**
     * Builds a DOM DocumentFragment from a string of HTML markup using only
     * createDocumentFragment / createElement / createTextNode / appendChild /
     * setAttribute (no innerHTML).
     *
     * The string is consumed left to right, one token at a time:
     *  - text up to the next '<' becomes a text node (whitespace is kept,
     *    character entities are NOT decoded);
     *  - a start tag becomes an element if it is h1-h6 or verifyElem()
     *    accepts its name; tags with other names are dropped, but their
     *    content is kept;
     *  - elements for which isContainer() is true (and h1-h6) become the
     *    parent of what follows, until their end tag is seen;
     *  - an end tag closes the nearest open element of that name, together
     *    with any elements opened inside it (this copes with omitted
     *    optional end tags such as </p>); an end tag that matches nothing
     *    is ignored.
     * Comments, doctypes and other '<' sequences that are not tags are not
     * recognised and end up as text.
     *
     * @param HTMLstring  markup to convert; converted to a string if it is
     *                    not one already
     * @return the populated DocumentFragment, or null when HTMLstring is
     *         undefined or null
     */
    function makeHTMLDocFrag(HTMLstring)
    {
        /* One tag at the start of the input. Group 1: "/" for an end tag,
           group 2: tag name, group 3: raw attribute text. Quoted values are
           consumed as a unit so a '>' inside them does not end the tag, and
           [^'">] (instead of a greedy .+) stops at the FIRST closing '>'. */
        var tagRE = /^<(\/?)([A-Za-z][A-Za-z0-9]*)((?:"[^"]*"|'[^']*'|[^'">])*)>/;
        /* One attribute. Group 1: name, then the value as double-quoted
           (group 2), single-quoted (group 3) or unquoted (group 4). */
        var attrRE = /([A-Za-z_:][\w:.\-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+)))?/g;
        var headerRE = /^h[1-6]$/;
        var selfClosingRE = /\/\s*$/;
        var rest, cut, m, name, depth, elemNode, attr, value, isHeader;

        if (HTMLstring == null)
            return (null);

        var docFrag = document.createDocumentFragment();
        /* Parallel stacks of the currently open nodes and their tag names;
           slot 0 is the fragment itself and is never popped. */
        var openNodes = [ docFrag ];
        var openNames = [ "" ];

        rest = String(HTMLstring);
        while (rest.length > 0)
        {
            m = (rest.charAt(0) == '<') ? rest.match(tagRE) : null;

            if (m == null)
            {
                /* Text: runs to the next '<' (searching from index 1 so a
                   stray leading '<' is included and progress is guaranteed). */
                cut = rest.indexOf('<', 1);
                if (cut < 0)
                    cut = rest.length;
                openNodes[openNodes.length - 1].appendChild(
                    document.createTextNode(rest.substring(0, cut)));
                rest = rest.substring(cut);
                continue;
            }

            rest = rest.substring(m[0].length);
            name = m[2].toLowerCase();

            if (m[1] == '/')
            {
                /* End tag: find the nearest open element with this name. */
                for (depth = openNames.length - 1; depth > 0; depth--)
                    if (openNames[depth] == name)
                        break;
                if (depth > 0)
                {
                    openNodes.length = depth;
                    openNames.length = depth;
                }
                continue;
            }

            /* h1-h6 are accepted here because the element tables consulted
               by verifyElem() have no entries for them. */
            isHeader = headerRE.test(name);
            if (!isHeader && verifyElem(name) != true)
                continue;

            elemNode = document.createElement(name);
            openNodes[openNodes.length - 1].appendChild(elemNode);

            attrRE.lastIndex = 0;   /* a /g regex keeps state between exec() calls */
            while ((attr = attrRE.exec(m[3])) != null)
            {
                if (attr[2] != null)
                    value = attr[2];
                else if (attr[3] != null)
                    value = attr[3];
                else if (attr[4] != null)
                    value = attr[4];
                else
                    value = attr[1];    /* minimized attribute, e.g. "checked" */
                elemNode.setAttribute(attr[1], value);
            }

            if ((isHeader || isContainer(name) == true) &&
                !selfClosingRE.test(m[3]))
            {
                openNodes.push(elemNode);
                openNames.push(name);
            }
        }
        return (docFrag);
    }
  • Martin Honnen

    #2
    Re: Regular Expressions Difficulty



    Befuddled wrote:
    [color=blue]
    > I am writing a function to have its argument, HTML-containing string,
    > return a DOM 1 Document Fragment, and so it seems the use of regular
    > expressions (REs) is a natural.[/color]

    HTML browsers have HTML parsing built in so why do you need regular
    expressions to parse HTML, why don't you simply create an element, set
    its innerHTML to the HTML snippet and then read out the child nodes as
    needed:
    var div = document.createElement('div');
    div.innerHTML = htmlString;
    Now build a document fragment if needed and simply move the child nodes
    of the div to the fragment if you want.

    --

    Martin Honnen

    Comment

    • Befuddled

      #3
      Re: Regular Expressions Difficulty

      Martin Honnen <mahotrash@yaho o.de> wrote in news:41baedaf$0 $16044
      $9b4e6d93@newsr ead4.arcor-online.net:
      [color=blue]
      >
      >
      > Befuddled wrote:
      >[color=green]
      >> I am writing a function to have its argument, HTML-containing string,
      >> return a DOM 1 Document Fragment, and so it seems the use of regular
      >> expressions (REs) is a natural.[/color]
      >
      > HTML browsers have HTML parsing built in so why do you neeed regular
      > expressions to parse HTML, why don't you simply create an element, set
      > its innerHTML to the HTML snippet and then read out the child nodes as
      > needed:
      > var div = document.create Element('div');
      > div.innerHTML = htmlString;[/color]

      I was avoiding the property 'innerHTML' because I did not know if it was
      standardized in DOM at any level. I am ABSOLUTELY avoiding the use of
      extensions beyond the standard (or more modestly put forth as a
      "recommendation"), no matter how many browsers have the functionality to
      interpret it, even if it is 99.999% of all browsers used on the planet.

      If 'innerHTML' is now standardized, that saves a lot of
      work/coding/function writing. Searches of the specifications for DOM
      (and JavaScript for that matter) that I have in my possession for the
      property 'innerHTML' produce ZERO results. Please provide a URL to the
      DOM and/or JavaScript specification that I am missing so that I can make
      use of that information. Thanks.
      [color=blue]
      > Now build a document fragment if needed and simply move the child nodes
      > of the div to the fragment if you want.
      >[/color]

      Comment

      • Martin Honnen

        #4
        Re: Regular Expressions Difficulty



        Befuddled wrote:
        [color=blue]
        > Martin Honnen <mahotrash@yaho o.de> wrote[/color]
        [color=blue][color=green]
        >>HTML browsers have HTML parsing built in so why do you neeed regular
        >>expressions to parse HTML, why don't you simply create an element, set
        >>its innerHTML to the HTML snippet and then read out the child nodes as
        >>needed:
        >> var div = document.create Element('div');
        >> div.innerHTML = htmlString;[/color]
        >
        >
        > I was avoiding the property 'innerHTML' because I did not know if it was
        > standardized in DOM at any level. I am ABSOLUTELY avoiding the use of
        > extensions beyond the standard (or more modestly put forth as a
        > "recommendation ")[/color]

        So you would prefer createDocumentFragment for instance to innerHTML
        because createDocumentFragment is in the W3C recommendation but
        innerHTML is not? For instance, IE 5.5 doesn't support
        createDocumentFragment so your code will not work there. innerHTML
        certainly has far greater support than createDocumentFragment.
        But anyway, as for your regular expression problem, matching by default
        is greedy meaning as much as possible is matched so your expression
        correctly consumes characters to the last > it can find.
        If you want non greedy matching then you can use ? after the quantifier e.g.
        .+?
        but support for that is only in ECMAScript edition 3 compatible
        implementations; with older browsers such a construct is likely to not
        give the desired result.
        There are workarounds such as
        /<([^>]+)>/
        --

        Martin Honnen

        Comment

        • Befuddled

          #5
          Re: Regular Expressions Difficulty

          Martin Honnen <mahotrash@yaho o.de> wrote in
          news:41bb0730$0 $16044$9b4e6d93 @newsread4.arco r-online.net:
          [color=blue]
          >
          >
          > Befuddled wrote:
          >[color=green]
          >> Martin Honnen <mahotrash@yaho o.de> wrote[/color]
          >[color=green][color=darkred]
          >>>HTML browsers have HTML parsing built in so why do you neeed regular
          >>>expression s to parse HTML, why don't you simply create an element,
          >>>set its innerHTML to the HTML snippet and then read out the child
          >>>nodes as needed:
          >>> var div = document.create Element('div');
          >>> div.innerHTML = htmlString;[/color]
          >>
          >>
          >> I was avoiding the property 'innerHTML' because I did not know if it
          >> was standardized in DOM at any level. I am ABSOLUTELY avoiding the
          >> use of extensions beyond the standard (or more modestly put forth as
          >> a "recommendation ")[/color]
          >
          > So you would prefer createDocumentF ragment for instance to innerHTML
          > because createDocumentF ragment is in the W3C recommendation but
          > innerHTML is not? For istance IE 5.5 doesn't support
          > createDocumentF ragment so your code will not work there. innerHTML
          > certainly has far greater support than createDocumentF ragment.[/color]

          You're right. I was hasty in my explanation of adhering to the standard.
          I should have said that while my first duty is to the standard and to get
          its code in place, after writing its code, I attempt to include browser-
          dependent code, where possible, to accommodate browsers that don't happen
          to understand the standard. Sorry for being misleading, sounding
          impractical, and standing too adamantly.
          [color=blue]
          > But anyway, as for your regular expression problem, matching by
          > default is greedy meaning as much as possible is matched so your
          > expression correctyly consumes characters to the last > it can find.[/color]

          I suppose there was a good reason why the original developers of regular
          expressions wanted them to consume as much text as possible in matching
          criteria, rather than grabbing what was minimal (working from left to
          right, rather than right to left). I would love to know their reasoning.
          [color=blue]
          > If you want non greedy matching then you can use ? after the
          > quantifier e.g.
          > .+?
          > but support for that is only in ECMAScript edition 3 compatible
          > implementations , with older browsers such a construct is likely to not
          > give the desired result.
          > There are workarounds such as
          > /<([^>]+)>/[/color]

          Your solution appears to be working nicely. Thanks for all your good
          information.


          --


          Comment

          Working...