form crawler

Collapse
X
 
  • Time
  • Show
Clear All
new posts
  • bernouli
    New Member
    • Oct 2012
    • 26

    form crawler

    This code crawls only the URLs on a site's own domain, e.g. www.example.com/orange-is-good.html,
    and displays the words in the URL that match the searched word. Now, how do I make it also crawl the keywords, description, title, and body content of pages,

    images (via their title and alt attributes), headings (e.g. h1 to h6), anchor links, etc.?


    Code:
    
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />

    <!-- Page title: one of the fields the crawler is meant to index. -->
    <title>Orange is good</title>

    <!-- Description and keyword metadata for the crawler to index. -->
    <meta name="Description" content="orange is good for health" />
    <meta name="Keywords" content="ripe orange,fresh orange,good orange" />
    </head>

    <body>
    <!-- The anchor sits inside the heading: XHTML 1.0 does not allow a block-level h1 inside an inline a. -->
    <h1><a href="http://www.example.com/orange-is-good.html">Orange is Good</a></h1><br />

    <p>Orange is the best among all the fruits in the world. Orange has been in place since 2013.</p>

    <!-- The opening "<" was missing here, so the tag was rendered as plain text instead of an image. -->
    <img src="orange.jpg" title="orange site" alt="orange is a good fruit." width="200" height="200" />

    </body>
    </html>



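    In the code below, replace ADD DOMAIN NAME HERE with the domain of the site to crawl (for the sample page above, www.example.com/orange-is-good.html, that is www.example.com).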


    Code:
    <?php

    /*
     * Minimal link crawler.
     *
     * Each request fetches one page (the domain's front page on the first
     * run, afterwards the URL stored in $_SESSION['page']), extracts the
     * href of every <a> element, stores URLs not yet present in the `pages`
     * table, picks a random stored URL as the next page to fetch, and asks
     * the browser to reload immediately so the cycle repeats.
     *
     * Session keys used: 'page' (next URL to fetch), 'i' (number of URLs
     * inserted during this session).
     *
     * NOTE: in a real file the opening PHP tag must be the very first bytes;
     * anything before it is sent as output ahead of the session/refresh headers.
     */

    session_start();

    // Buffer all output so the Refresh header at the end of the script can
    // still be sent after the progress output has been produced.
    ob_start();

    $domain = "ADD DOMAIN NAME HERE";

    // Connection placeholders - replace with real credentials.
    $dbHost = "HOST";
    $dbUser = "USERNAME";
    $dbPass = "PASSWORD";
    $dbName = "DATABASE NAME";

    /**
     * Turns an href found on a page into an absolute URL.
     *
     * @param string $href   Raw value of an href attribute.
     * @param string $domain Host name used to resolve relative links.
     *
     * @return string|null Absolute http(s) URL, or null when the link cannot
     *                     be crawled (empty, fragment-only, mailto:, javascript:, ...).
     */
    function crawler_absolute_url($href, $domain)
    {
        $href = trim($href);

        if ($href === '' || $href[0] === '#') {
            return null;
        }

        // Already absolute: keep as-is.
        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }

        // Protocol-relative link (//host/path).
        if (strncmp($href, '//', 2) === 0) {
            return 'http:' . $href;
        }

        // Any other scheme (mailto:, javascript:, tel:, ftp:, ...) is not a page.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
            return null;
        }

        // Relative link: resolve against the domain root, making sure exactly
        // one slash separates host and path.
        return 'http://' . $domain . '/' . ltrim($href, '/');
    }

    // One connection for both the first run and all follow-up runs.
    try {
        $db = new PDO(
            'mysql:host=' . $dbHost . ';dbname=' . $dbName . ';charset=utf8',
            $dbUser,
            $dbPass,
            [
                PDO::ATTR_ERRMODE          => PDO::ERRMODE_EXCEPTION,
                PDO::ATTR_EMULATE_PREPARES => false,
            ]
        );
    } catch (PDOException $e) {
        // MySQL error 1049 is "Unknown database"; everything else is treated
        // as a connection failure.
        if ((int) $e->getCode() === 1049) {
            die("MySQL could not select Database!");
        }
        die("MySQL could not connect!");
    }

    // Decide which page to fetch on this request.
    if (empty($_SESSION['page'])) {
        $pageUrl = "http://" . $domain . "/";
        $_SESSION['i'] = 0;
    } else {
        $pageUrl = $_SESSION['page'];
        if (!isset($_SESSION['i'])) {
            $_SESSION['i'] = 0;
        }
    }

    $original_file = file_get_contents($pageUrl);
    if ($original_file === false) {
        // Unreachable page: treat it as empty and move on to another one.
        $original_file = '';
    }

    // Keep only the anchors, then capture the href of each one.
    $stripped_file = strip_tags($original_file, "<a>");
    $matches = [];
    preg_match_all("/<a(?:[^>]*)href=\"([^\"]*)\"(?:[^>]*)>(?:[^<]*)<\/a>/is", $stripped_file, $matches);
    $hrefs = isset($matches[1]) ? array_unique($matches[1]) : [];

    try {
        // Crawled hrefs are untrusted input, so they are only ever passed as
        // bound parameters.
        $findPage   = $db->prepare("SELECT 1 FROM pages WHERE url = ? LIMIT 1");
        $insertPage = $db->prepare("INSERT INTO pages (url) VALUES (?)");

        foreach ($hrefs as $href) {
            $newUrl = crawler_absolute_url($href, $domain);
            if ($newUrl === null) {
                continue;
            }

            $findPage->execute([$newUrl]);
            $alreadyStored = $findPage->fetchColumn() !== false;
            $findPage->closeCursor();

            if (!$alreadyStored) {
                $insertPage->execute([$newUrl]);

                $_SESSION['i']++;

                echo $_SESSION['i'] . "";
            }
        }

        // Pick the next page to crawl at random from everything stored so far.
        $nextUrl = $db->query("SELECT url FROM pages ORDER BY RAND() LIMIT 0,1")->fetchColumn();
        $RandReturn = ($nextUrl === false) ? 0 : 1;
        if ($nextUrl !== false) {
            $_SESSION['page'] = $nextUrl;
        }

        echo $RandReturn;
    } catch (PDOException $e) {
        echo htmlspecialchars($e->getMessage(), ENT_QUOTES, 'UTF-8');
    }

    if (isset($_SESSION['page'])) {
        // The stored URL came from a crawled page; escape it before output.
        echo htmlspecialchars($_SESSION['page'], ENT_QUOTES, 'UTF-8');
    }

    $db = null;

    // Send the header before the buffered output is flushed.
    header("refresh: 0;");
    ob_end_flush();

    ?>
Working...