Benchmarks

Collapse
This topic is closed.
X
X
 
  • Time
  • Show
Clear All
new posts
  • s0suk3@gmail.com

    Benchmarks

    The task: Write a program that reads a set of words from standard
    input and prints the number of distinct words.

    I came across a website that listed a few programs to accomplish this
    task: http://unthought.net/c++/c_vs_c++.html (ignore all the language
    flaming :-), and thought that all of them did unnecessary operations,
    so I wrote my own. But for some reason, my version turned out slower
    that ALL of the versions in the website, even though it seems to
    perform less operations (yes, I benchmarked them on my own computer).

    According to the website, the slowest version is:

    #include <set>
    #include <string>
    #include <iostream>

    int main(int argc, char **argv)
    {
    // Declare and Initialize some variables
    std::string word;
    std::set<std::s tringwordcount;
    // Read words and insert in rb-tree
    while (std::cin >word) wordcount.inser t(word);
    // Print the result
    std::cout << "Words: " << wordcount.size( ) << std::endl;
    return 0;
    }

    My version is about 12 times slower than that. It uses lower-level
    constructs. Here it is:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <ctype.h>

    struct SetNode
    {
    char *word;
    struct SetNode *next;
    };

    // An unorderd set of words
    //
    static struct SetNode *gSet = 0;
    static int gSetSize = 0;

    #define kInitWordSize 32

    // Returns a word read from stdin. The returned pointer must be
    // deallocated with free().
    //
    static char *
    ReadOneWord(voi d)
    {
    int ch = getchar();

    while (ch != EOF && isspace(ch))
    ch = getchar();
    if (ch == EOF)
    return 0;

    char *word = (char *) malloc(kInitWor dSize);
    if (!word)
    return 0;

    int size = kInitWordSize;
    int i = 0;

    while (ch != EOF && !isspace(ch)) {
    if (i >= size) {
    size *= 2;

    char *newWord = (char *) realloc(word, size);
    if (!newWord) {
    free(word);
    return 0;
    }
    word = newWord;
    }

    word[i++] = ch;
    ch = getchar();
    }

    if (i >= size) {
    size *= 2;

    char *newWord = (char *) realloc(word, size);
    if (!newWord) {
    free(word);
    return 0;
    }
    word = newWord;
    }
    word[i] = '\0';

    return word;
    }

    // Inserts a word into the set if it isn't in the set.
    // The passed string is expected to have been allocated with
    // a memory allocation function, and it should be considered
    // lost after passed to this function.
    //
    static void
    InsertWord(char *aWord)
    {
    struct SetNode *node;

    for (node = gSet; node; node = node->next) {
    if (strcmp(node->word, aWord) == 0) {
    free(aWord);
    return;
    }
    }

    node = (struct SetNode *) malloc(sizeof(s truct SetNode));
    if (!node) {
    free(aWord);
    return;
    }

    node->word = aWord;
    node->next = gSet;
    gSet = node;
    ++gSetSize;
    }

    static void
    DeleteSet(void)
    {
    struct SetNode *node = gSet;
    struct SetNode *temp;

    while (node) {
    temp = node;
    node = node->next;
    free(temp->word);
    free(temp);
    }

    gSet = 0;
    gSetSize = 0;
    }

    int
    main(void)
    {
    char *word;

    while ((word = ReadOneWord()))
    InsertWord(word );

    printf("Words: %d\n", gSetSize);

    // Skip cleanup for now...
    //DeleteSet();
    }

    Any ideas as to what causes the big slowdown?

    Sebastian

  • Kai-Uwe Bux

    #2
    Re: Benchmarks

    s0suk3@gmail.co m wrote:
    The task: Write a program that reads a set of words from standard
    input and prints the number of distinct words.
    >
    I came across a website that listed a few programs to accomplish this
    task: http://unthought.net/c++/c_vs_c++.html (ignore all the language
    flaming :-), and thought that all of them did unnecessary operations,
    so I wrote my own. But for some reason, my version turned out slower
    that ALL of the versions in the website, even though it seems to
    perform less operations (yes, I benchmarked them on my own computer).
    >
    According to the website, the slowest version is:
    >
    #include <set>
    #include <string>
    #include <iostream>
    >
    int main(int argc, char **argv)
    {
    // Declare and Initialize some variables
    std::string word;
    std::set<std::s tringwordcount;
    // Read words and insert in rb-tree
    while (std::cin >word) wordcount.inser t(word);
    // Print the result
    std::cout << "Words: " << wordcount.size( ) << std::endl;
    return 0;
    }
    >
    My version is about 12 times slower than that. It uses lower-level
    constructs. Here it is:
    >
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <ctype.h>
    >
    struct SetNode
    {
    char *word;
    struct SetNode *next;
    };
    This is a linear list.

    // An unorderd set of words
    //
    static struct SetNode *gSet = 0;
    static int gSetSize = 0;
    >
    #define kInitWordSize 32
    >
    // Returns a word read from stdin. The returned pointer must be
    // deallocated with free().
    //
    static char *
    ReadOneWord(voi d)
    {
    int ch = getchar();
    >
    while (ch != EOF && isspace(ch))
    ch = getchar();
    if (ch == EOF)
    return 0;
    >
    char *word = (char *) malloc(kInitWor dSize);
    if (!word)
    return 0;
    >
    int size = kInitWordSize;
    int i = 0;
    >
    while (ch != EOF && !isspace(ch)) {
    if (i >= size) {
    size *= 2;
    >
    char *newWord = (char *) realloc(word, size);
    if (!newWord) {
    free(word);
    return 0;
    }
    word = newWord;
    }
    >
    word[i++] = ch;
    ch = getchar();
    }
    >
    if (i >= size) {
    size *= 2;
    >
    char *newWord = (char *) realloc(word, size);
    if (!newWord) {
    free(word);
    return 0;
    }
    word = newWord;
    }
    word[i] = '\0';
    >
    return word;
    }
    >
    // Inserts a word into the set if it isn't in the set.
    // The passed string is expected to have been allocated with
    // a memory allocation function, and it should be considered
    // lost after passed to this function.
    //
    static void
    InsertWord(char *aWord)
    {
    struct SetNode *node;
    >
    for (node = gSet; node; node = node->next) {
    if (strcmp(node->word, aWord) == 0) {
    free(aWord);
    return;
    }
    }
    Here, you do a linear search.

    std::set<mainta ins a (balanced) tree internally and therefore does fewer
    comparisons per word (logarithmic vs. linear).

    node = (struct SetNode *) malloc(sizeof(s truct SetNode));
    if (!node) {
    free(aWord);
    return;
    }
    >
    node->word = aWord;
    node->next = gSet;
    gSet = node;
    ++gSetSize;
    }
    >
    static void
    DeleteSet(void)
    {
    struct SetNode *node = gSet;
    struct SetNode *temp;
    >
    while (node) {
    temp = node;
    node = node->next;
    free(temp->word);
    free(temp);
    }
    >
    gSet = 0;
    gSetSize = 0;
    }
    >
    int
    main(void)
    {
    char *word;
    >
    while ((word = ReadOneWord()))
    InsertWord(word );
    >
    printf("Words: %d\n", gSetSize);
    >
    // Skip cleanup for now...
    //DeleteSet();
    }
    >
    Any ideas as to what causes the big slowdown?
    Choice of a sub-optimal data structure.


    Best

    Kai-Uwe Bux

    Comment

    • Obnoxious User

      #3
      Re: Benchmarks

      On Thu, 06 Nov 2008 07:53:18 -0800, s0suk3 wrote:
      The task: Write a program that reads a set of words from standard input
      and prints the number of distinct words.
      >
      I came across a website that listed a few programs to accomplish this
      task: http://unthought.net/c++/c_vs_c++.html (ignore all the language
      flaming :-), and thought that all of them did unnecessary operations, so
      I wrote my own. But for some reason, my version turned out slower that
      ALL of the versions in the website, even though it seems to perform less
      operations (yes, I benchmarked them on my own computer).
      >
      According to the website, the slowest version is:
      >
      #include <set>
      #include <string>
      #include <iostream>
      >
      int main(int argc, char **argv)
      {
      // Declare and Initialize some variables std::string word;
      std::set<std::s tringwordcount;
      // Read words and insert in rb-tree
      while (std::cin >word) wordcount.inser t(word); // Print the
      result
      std::cout << "Words: " << wordcount.size( ) << std::endl; return
      0;
      }
      >
      My version is about 12 times slower than that. It uses lower-level
      constructs. Here it is:
      >
      [snip]

      So, since your version uses "lower-level constructs" you assume
      it would be automatically faster?

      --
      OU
      Remember 18th of June 2008, Democracy died that afternoon.

      Comment

      • Keith Thompson

        #4
        Re: Benchmarks

        s0suk3@gmail.co m writes:
        The task: Write a program that reads a set of words from standard
        input and prints the number of distinct words.
        >
        I came across a website that listed a few programs to accomplish this
        task: http://unthought.net/c++/c_vs_c++.html (ignore all the language
        flaming :-), and thought that all of them did unnecessary operations,
        so I wrote my own. But for some reason, my version turned out slower
        that ALL of the versions in the website, even though it seems to
        perform less operations (yes, I benchmarked them on my own computer).
        >
        According to the website, the slowest version is:
        >
        #include <set>
        #include <string>
        #include <iostream>
        >
        int main(int argc, char **argv)
        {
        // Declare and Initialize some variables
        std::string word;
        std::set<std::s tringwordcount;
        // Read words and insert in rb-tree
        while (std::cin >word) wordcount.inser t(word);
        // Print the result
        std::cout << "Words: " << wordcount.size( ) << std::endl;
        return 0;
        }
        >
        My version is about 12 times slower than that. It uses lower-level
        constructs. Here it is:
        >
        [snip]
        // Inserts a word into the set if it isn't in the set.
        // The passed string is expected to have been allocated with
        // a memory allocation function, and it should be considered
        // lost after passed to this function.
        //
        static void
        InsertWord(char *aWord)
        {
        struct SetNode *node;
        >
        for (node = gSet; node; node = node->next) {
        if (strcmp(node->word, aWord) == 0) {
        free(aWord);
        return;
        }
        }
        You represent your set of words as a linked list. You compare each
        new word to every word already in the set. The C++ solution uses a
        std::set which, if I recall correctly, can do searches and insertions
        in O(n log n).

        If you re-write this to use a balanced binary tree, such as an AVL
        tree, you should get performance similar to the C++ version.
        node = (struct SetNode *) malloc(sizeof(s truct SetNode));
        Not incorrect, but
        node = malloc(sizeof *node);
        would be better.
        if (!node) {
        free(aWord);
        return;
        }
        And if malloc fails, you quietly return without doing anything to
        handle the error or report it to the user.

        [...]

        --
        Keith Thompson (The_Other_Keit h) kst-u@mib.org <http://www.ghoti.net/~kst>
        Nokia
        "We must do something. This is something. Therefore, we must do this."
        -- Antony Jay and Jonathan Lynn, "Yes Minister"

        Comment

        • Longjun

          #5
          Re: Benchmarks

          On Nov 6, 11:53 pm, s0s...@gmail.co m wrote:
          The task: Write a program that reads a set of words from standard
          input and prints the number of distinct words.
          >
          I came across a website that listed a few programs to accomplish this
          task:http://unthought.net/c++/c_vs_c++.html(ignore all the language
          flaming :-), and thought that all of them did unnecessary operations,
          so I wrote my own. But for some reason, my version turned out slower
          that ALL of the versions in the website, even though it seems to
          perform less operations (yes, I benchmarked them on my own computer).
          >
          According to the website, the slowest version is:
          >
          #include <set>
          #include <string>
          #include <iostream>
          >
          int main(int argc, char **argv)
          {
                  // Declare and Initialize some variables
                  std::string word;
                  std::set<std::s tringwordcount;
                  // Read words and insert in rb-tree
                  while (std::cin >word) wordcount.inser t(word);
                  // Print the result
                  std::cout << "Words: " << wordcount.size( ) << std::endl;
                  return 0;
          >
          }
          >
          My version is about 12 times slower than that. It uses lower-level
          constructs. Here it is:
          >
          #include <stdio.h>
          #include <stdlib.h>
          #include <string.h>
          #include <ctype.h>
          >
          struct SetNode
          {
              char *word;
              struct SetNode *next;
          >
          };
          >
          // An unorderd set of words
          //
          static struct SetNode *gSet = 0;
          static int gSetSize = 0;
          >
          #define kInitWordSize 32
          >
          // Returns a word read from stdin. The returned pointer must be
          // deallocated with free().
          //
          static char *
          ReadOneWord(voi d)
          {
              int ch = getchar();
          >
              while (ch != EOF && isspace(ch))
                  ch = getchar();
              if (ch == EOF)
                  return 0;
          >
              char *word = (char *) malloc(kInitWor dSize);
              if (!word)
                  return 0;
          >
              int size = kInitWordSize;
              int i = 0;
          >
              while (ch != EOF && !isspace(ch)) {
                  if (i >= size) {
                      size *= 2;
          >
                      char *newWord = (char *) realloc(word, size);
                      if (!newWord) {
                          free(word);
                          return 0;
                      }
                      word = newWord;
                  }
          >
                  word[i++] = ch;
                  ch = getchar();
              }
          >
              if (i >= size) {
                  size *= 2;
          >
                  char *newWord = (char *) realloc(word, size);
                  if (!newWord) {
                      free(word);
                      return 0;
                  }
                  word = newWord;
              }
              word[i] = '\0';
          >
              return word;
          >
          }
          >
          // Inserts a word into the set if it isn't in the set.
          // The passed string is expected to have been allocated with
          // a memory allocation function, and it should be considered
          // lost after passed to this function.
          //
          static void
          InsertWord(char *aWord)
          {
              struct SetNode *node;
          >
              for (node = gSet; node; node = node->next) {
                  if (strcmp(node->word, aWord) == 0) {
                      free(aWord);
                      return;
                  }
              }
          >
              node = (struct SetNode *) malloc(sizeof(s truct SetNode));
              if (!node) {
                  free(aWord);
                  return;
              }
          >
              node->word = aWord;
              node->next = gSet;
              gSet = node;
              ++gSetSize;
          >
          }
          >
          static void
          DeleteSet(void)
          {
              struct SetNode *node = gSet;
              struct SetNode *temp;
          >
              while (node) {
                  temp = node;
                  node = node->next;
                  free(temp->word);
                  free(temp);
              }
          >
              gSet = 0;
              gSetSize = 0;
          >
          }
          >
          int
          main(void)
          {
              char *word;
          >
              while ((word = ReadOneWord()))
                  InsertWord(word );
          >
              printf("Words: %d\n", gSetSize);
          >
              // Skip cleanup for now...
              //DeleteSet();
          >
          }
          >
          Any ideas as to what causes the big slowdown?
          >
          Sebastian
          Noticed that you've implemented your own mechanism of scanning words
          from standard input and insert a new elements in your "sets". I don't
          know why you implement it by yourself. Are you clear the principle of
          the class cin/cout and set? Are you sure that your own function have a
          better performance to the standard one?
          I'm interested with how to test your application performance. Can you
          tell me? I suggest you compare the performance difference between your
          function between the standard one step by step.

          Thanks

          Comment

          • Juha Nieminen

            #6
            Re: Benchmarks

            Keith Thompson wrote:
            You represent your set of words as a linked list. You compare each
            new word to every word already in the set. The C++ solution uses a
            std::set which, if I recall correctly, can do searches and insertions
            in O(n log n).
            That would be worse than linear-time, and is of course false.

            You meant: O(lg n).

            Comment

            • Juha Nieminen

              #7
              Re: Benchmarks

              s0suk3@gmail.co m wrote:
              Any ideas as to what causes the big slowdown?
              Why do you expect that searching a linked list could be even close to
              the speed of searching a balanced binary tree?

              Comment

              • s0suk3@gmail.com

                #8
                Re: Benchmarks

                On Nov 6, 11:22 am, Obnoxious User <OU@127.0.0.1wr ote:
                On Thu, 06 Nov 2008 07:53:18 -0800, s0suk3 wrote:
                The task: Write a program that reads a set of words from standard input
                and prints the number of distinct words.
                >
                I came across a website that listed a few programs to accomplish this
                task:http://unthought.net/c++/c_vs_c++.html(ignore all the language
                flaming :-), and thought that all of them did unnecessary operations, so
                I wrote my own. But for some reason, my version turned out slower that
                ALL of the versions in the website, even though it seems to perform less
                operations (yes, I benchmarked them on my own computer).
                >
                According to the website, the slowest version is:
                >
                #include <set>
                #include <string>
                #include <iostream>
                >
                int main(int argc, char **argv)
                {
                        // Declare and Initialize some variables std::string word;
                        std::set<std::s tringwordcount;
                        // Read words and insert in rb-tree
                        while (std::cin >word) wordcount.inser t(word); // Print the
                        result
                        std::cout << "Words: " << wordcount.size( ) << std::endl; return
                        0;
                }
                >
                My version is about 12 times slower than that. It uses lower-level
                constructs. Here it is:
                >
                [snip]
                >
                So, since your version uses "lower-level constructs" you assume
                it would be automatically faster?
                Well, in this case it did seem to have a couple of advantages, like
                for example, InsertWord() is given a pointer directly to the malloc()-
                allocated string, which it simply copies into the .word member of the
                struct; it doesn't need to allocate new memory and copy the string
                from one place to the other, whereas std::set::inser t() does need to
                do make a copy.

                Also, I'm not sure, but is the set's destructor invoked when main()
                goes out of scope (causing memory cleanup)? (Currently the other
                version has the DeleteSet() call commented-out.)

                But, as others have pointed out, it's clear that the reason for the
                difference in performance is the searching mechanism.

                Sebastian

                Comment

                • Pawel Dziepak

                  #9
                  Re: Benchmarks

                  -----BEGIN PGP SIGNED MESSAGE-----
                  Hash: SHA1

                  s0suk3@gmail.co m wrote:
                  Well, in this case it did seem to have a couple of advantages, like
                  for example, InsertWord() is given a pointer directly to the malloc()-
                  allocated string, which it simply copies into the .word member of the
                  struct; it doesn't need to allocate new memory and copy the string
                  from one place to the other, whereas std::set::inser t() does need to
                  do make a copy.
                  >
                  Also, I'm not sure, but is the set's destructor invoked when main()
                  goes out of scope (causing memory cleanup)? (Currently the other
                  version has the DeleteSet() call commented-out.)
                  >
                  But, as others have pointed out, it's clear that the reason for the
                  difference in performance is the searching mechanism.
                  Indeed it is. All that low-level stuff is faster, but it means nothing
                  when the problem is an algorithm and/or used data structure.

                  Pawel Dziepak
                  -----BEGIN PGP SIGNATURE-----
                  Version: GnuPG v1.4.9 (GNU/Linux)
                  Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

                  iEYEARECAAYFAkk TNCAACgkQPFW+cU iIHNo9IQCgr6NC7 6/yXnouBhUYLn3fx3 Rn
                  2HQAn3h19yS62OZ qMv+jo0wSsr6M57 6O
                  =WTrr
                  -----END PGP SIGNATURE-----

                  Comment

                  • Keith Thompson

                    #10
                    Re: Benchmarks

                    Juha Nieminen <nospam@thanks. invalidwrites:
                    Keith Thompson wrote:
                    >You represent your set of words as a linked list. You compare each
                    >new word to every word already in the set. The C++ solution uses a
                    >std::set which, if I recall correctly, can do searches and insertions
                    >in O(n log n).
                    >
                    That would be worse than linear-time, and is of course false.
                    >
                    You meant: O(lg n).
                    Oops, you're correct of course; thanks for the correction.

                    Actually I meant O(log n), but it's the same thing.

                    --
                    Keith Thompson (The_Other_Keit h) kst-u@mib.org <http://www.ghoti.net/~kst>
                    Nokia
                    "We must do something. This is something. Therefore, we must do this."
                    -- Antony Jay and Jonathan Lynn, "Yes Minister"

                    Comment

                    • Keith Thompson

                      #11
                      Re: Benchmarks

                      Longjun <Longjun.Qiong@ gmail.comwrites :
                      On Nov 6, 11:53 pm, s0s...@gmail.co m wrote:
                      >The task: Write a program that reads a set of words from standard
                      >input and prints the number of distinct words.
                      >>
                      >I came across a website that listed a few programs to accomplish this
                      >task:http://unthought.net/c++/c_vs_c++.html(ignore all the language
                      >flaming :-), and thought that all of them did unnecessary operations,
                      >so I wrote my own. But for some reason, my version turned out slower
                      >that ALL of the versions in the website, even though it seems to
                      >perform less operations (yes, I benchmarked them on my own computer).
                      >>
                      >According to the website, the slowest version is:
                      >>
                      >#include <set>
                      >#include <string>
                      >#include <iostream>
                      [...]
                      >>
                      >My version is about 12 times slower than that. It uses lower-level
                      >constructs. Here it is:
                      >>
                      >#include <stdio.h>
                      >#include <stdlib.h>
                      >#include <string.h>
                      >#include <ctype.h>
                      [...]
                      >>
                      >Any ideas as to what causes the big slowdown?
                      >
                      Noticed that you've implemented your own mechanism of scanning words
                      from standard input and insert a new elements in your "sets". I don't
                      know why you implement it by yourself. Are you clear the principle of
                      the class cin/cout and set? Are you sure that your own function have a
                      better performance to the standard one?
                      This discussion is cross-posted to comp.lang.c and comp.lang.c++. His
                      solution is written in C, which doesn't have sets built into the
                      standard library.

                      But certainly the equivalent functionality could be written in C.

                      --
                      Keith Thompson (The_Other_Keit h) kst-u@mib.org <http://www.ghoti.net/~kst>
                      Nokia
                      "We must do something. This is something. Therefore, we must do this."
                      -- Antony Jay and Jonathan Lynn, "Yes Minister"

                      Comment

                      • user923005

                        #12
                        Re: Benchmarks

                        On Nov 6, 7:53 am, s0s...@gmail.co m wrote:
                        The task: Write a program that reads a set of words from standard
                        input and prints the number of distinct words.
                        [snip]

                        I am surprised chuck has not chimed in here.
                        A hash table is *ideal* for this.

                        P.S. to those analyzing performance...
                        Since we have to examine every word in the list, the performance of
                        the algorithm *cannot* be faster than O(n).
                        The hash table solution is O(n).

                        Using a btree, skiplist, avltree, etc. will be O(n log n) because:
                        For each word, we must collect it.
                        For this word, we must check for duplicity. With a hash table the
                        check is O(1). With a logarithmic search structure, the check is
                        O(log n). (There is a multiplicative constant less than 1, but that
                        does not alter the O(log n) behavior.
                        Hence: O(log n) * O(n) = O(n log n) for most ordered list variants.

                        There is another structure that would be competitive. I guess that a
                        ternary search tree might beat a hash table just because of the
                        excellence in memory access pattern. At least for lists of less than
                        a million items (and it would be hard to come up with more than a
                        million correctly spelled real words).
                        Robert Sedgewick is the founding chair and the William O. Baker Professor in the Department of Computer Science at Princeton University.


                        Comment

                        • Juha Nieminen

                          #13
                          Re: Benchmarks

                          user923005 wrote:
                          The hash table solution is O(n).
                          You would have hard time proving that.

                          Comment

                          • user923005

                            #14
                            Re: Benchmarks

                            On Nov 6, 1:21 pm, Juha Nieminen <nos...@thanks. invalidwrote:
                            user923005 wrote:
                            The hash table solution is O(n).
                            >
                              You would have hard time proving that.
                            Cuckoo hashing has guaranteed O(1) lookup and delete and amortized
                            O(1) insert.

                            My solution would also do the counting at insert time (IOW, the hash
                            insert function will return 0 if the item is already in the table and
                            return 1 if the item was inserted).
                            In that way, there is no need to scan the table and you can make it as
                            large as you like.

                            IOW:

                            unsigned long long count = 0;

                            while (item = fgets(string, sizeof string, stdin))
                            count += cuckoo_hash_ins ert(item);

                            Comment

                            • Paul Hsieh

                              #15
                              Re: Benchmarks

                              On Nov 6, 1:51 pm, c...@tiac.net (Richard Harter) wrote:
                              On Thu, 6 Nov 2008 12:53:55 -0800 (PST), user923005
                              <dcor...@connx. comwrote:
                              On Nov 6, 7:53=A0am, s0s...@gmail.co m wrote:
                              The task: Write a program that reads a set of words from standard
                              input and prints the number of distinct words.
                              [snip]
                              >
                              [...] Using a btree, skiplist, avltree, etc. will be O(n log n) because:[...]
                              >
                              hash table      - O(log n)
                              comparison tree - O((log n)^2)
                              radix trees     - O(log n)
                              >
                              [etc]
                              I don't have any idea what anyone here is talking about. This is
                              clearly a "trie" problem. The performance is O(n), where n is the
                              length of the input (in characters). If your performance is any
                              different from that your implementation is just wrong. Here is my
                              solution, which is simpler/shorter than anything given so far or on
                              that website:

                              #include <stdio.h>
                              #include <stdlib.h>
                              #include <string.h>

                              struct trieNode {
                              int canTerminateHer e;
                              struct trieNode * letter[26];
                              };

                              static int * insertTrie (struct trieNode * tn, const char * w) {
                              if ('\0' == *w) return &tn->canTerminateHe re;
                              if (NULL == tn->letter[*w-'a']) {
                              if (NULL == (tn->letter[*w-'a'] = (struct trieNode *) calloc
                              (1, sizeof (struct trieNode)))) return NULL;
                              }
                              return insertTrie (tn->letter[*w-'a'], w+1);
                              }

                              int main () {
                              struct trieNode start = {0};
                              char buff[2048], *s, *t;
                              int count = 0, *ret;
                              while (buff == fgets (buff, 2048, stdin)) {
                              for (t = buff; *t;) {
                              s = t + strspn (t, "abcdefghijklmn opqrstuvwxyz");
                              if (s != t) {
                              char c = *s;
                              *s = '\0';
                              if (NULL == (ret = insertTrie (&start, t))) exit (-1);
                              *s = c;
                              count += 1 ^ *ret;
                              *ret = 1;
                              }
                              t = s + strcspn (s, "abcdefghijklmn opqrstuvwxyz");
                              }
                              }
                              printf ("Count: %d\n", count);
                              return 0;
                              }

                              This makes the assumption that all inputs are continguous words in
                              lines no longer than 2048 characters separated by white space or line
                              feeds or whatever. It also assumes that you have enough memory to
                              hold all the words input, at a rate of (26 * sizeof (void *) + sizeof
                              (int)) * (size of the dictionary of the input in characters), roughly
                              speaking. The program doesn't clean up after itself, but I don't
                              think that was in the requirements.

                              As for the *real* performance of this thing, it will come down to the
                              calloc() speed. I could waste a lot more lines of code to massively
                              improve the performance here. It might be worth it if, say, the
                              benchmark were trying to measure performance per line of code. So the
                              score would be, say, lines of code times time taken to output the
                              correct count or something, and it would then probably be worth it to
                              implement a calloc pooler (it would double the size at least, but
                              probably make the performance at least 3 times higher, if the words
                              had a high enough uniqueness rate).

                              --
                              Paul Hsieh
                              Pobox has been discontinued as a separate service, and all existing customers moved to the Fastmail platform.


                              Comment

                              Working...