/    Sign up×
Community /Pin to ProfileBookmark

Let Us Finish Spider Number 7 Together

Php Folks,

I found 9 web crawler PHP scripts online. Some are incomplete. Let’s fix them together.
On this thread, we work on spider number 7 on my list.

TEST RESULT: The script manages to extract the title of the initial page and submit it to the db,
but it fails to extract the ‘titles’ of the spidered pages.
It does, however, add the spidered pages’ ‘url paths’ to the db.

WORK WE NEED TO DO:
We need to add lines for it to:

  • extract the page titles of spidered pages

  • add the spidered pages’ titles to the db.
  • ““
    <?php
    //https://potentpages.com/web-crawler-development/tutorials/php/creating-a-simple-php-website-crawler

    /* TEST RESULT: MANAGES TO EXTRACT TITLE OF initial page and submit to db.
    BUT NOT extract titles of spidered pages.
    But lists spidered page’s url paths to db.
    */

    // Show every notice and warning while the crawler is being debugged.
    ini_set('display_errors', '1');
    error_reporting(E_ALL);

    // Since PHP 8.1 mysqli throws exceptions by default, which would bypass the
    // return-value checks this script relies on. Switch back to return values so
    // the error branches below (and after each query) are actually reachable.
    mysqli_report(MYSQLI_REPORT_OFF);

    // Connection settings for the database that holds the `pages` table.
    $mysql_host = 'localhost';
    $mysql_username = 'root';
    $mysql_password = '';
    $mysql_database = 'test';

    $mysql_conn = mysqli_connect( $mysql_host, $mysql_username, $mysql_password, $mysql_database );
    if ( !$mysql_conn ) {
        echo "Error: Unable to connect to MySQL." . PHP_EOL;
        echo "Debugging errno: " . mysqli_connect_errno() . PHP_EOL;
        echo "Debugging error: " . mysqli_connect_error() . PHP_EOL;
        // Non-zero exit status so a calling shell can detect the failure.
        exit(1);
    }
    /**
     * Download a web page via HTTP GET using libcurl.
     * Developed By: Potent Pages, LLC (https://potentpages.com/)
     *
     * Redirects are followed (at most 4). Because response headers are included
     * in the cURL output, a redirected request yields one header block per hop;
     * only the final block is parsed, and the body is split off using the header
     * size reported by cURL rather than by searching for the first blank line.
     *
     * Parameters are intentionally left untyped: callers pass the referer
     * straight from a database row, where it may be NULL.
     *
     * @param string      $target  Absolute URL to download.
     * @param string|null $referer Value for the Referer request header.
     *
     * @return array Array with two keys:
     *               - "headers": 'http_code' (raw status line), 'status_info'
     *                 (status line split on spaces, index 1 is the numeric code)
     *                 and one entry per response header, keyed by header name.
     *               - "body": the response body as a string.
     *               When the transfer fails, "body" is '' and
     *               'status_info'[1] is '0'.
     */
    function _http ( $target, $referer ) {
        //Initialize Handle
        $handle = curl_init();

        //Define Settings
        curl_setopt( $handle, CURLOPT_HTTPGET, true );
        curl_setopt( $handle, CURLOPT_HEADER, true );
        // NOTE(review): cookies are saved to one file and loaded from a different
        // one, so saved cookies are never sent back - confirm whether both options
        // were meant to name the same file.
        curl_setopt( $handle, CURLOPT_COOKIEJAR, "cookie_jar.txt" );
        curl_setopt( $handle, CURLOPT_COOKIEFILE, "cookies.txt" );
        curl_setopt( $handle, CURLOPT_USERAGENT, "web-crawler-tutorial-test" );
        curl_setopt( $handle, CURLOPT_URL, $target );
        curl_setopt( $handle, CURLOPT_REFERER, $referer );
        curl_setopt( $handle, CURLOPT_FOLLOWLOCATION, true );
        curl_setopt( $handle, CURLOPT_MAXREDIRS, 4 );
        curl_setopt( $handle, CURLOPT_RETURNTRANSFER, true );

        //Execute Request
        $output = curl_exec( $handle );

        // The header size must be read before the handle is closed.
        $header_size = 0;
        if ( $output !== false ) {
            $header_size = (int) curl_getinfo( $handle, CURLINFO_HEADER_SIZE );
        }

        //Close cURL handle
        curl_close( $handle );

        // Transfer failed (DNS error, refused connection, timeout, ...): report
        // a well-formed but empty response so callers can test status_info[1].
        if ( $output === false ) {
            return [
                "headers" => [ 'http_code' => '', 'status_info' => [ '', '0' ] ],
                "body"    => '',
            ];
        }

        //Separate Header and Body
        $raw_headers = substr( $output, 0, $header_size );
        $body = (string) substr( $output, $header_size );

        // One header block per redirect hop; the last block belongs to the
        // response whose body was returned.
        $header_blocks = explode( "\r\n\r\n", trim( $raw_headers ) );
        $header = end( $header_blocks );

        //Parse Headers
        $header_array = [];
        foreach ( explode( "\r\n", $header ) as $i => $line ) {
            if ( $i === 0 ) {
                // Status line, e.g. "HTTP/1.1 200 OK"
                $header_array['http_code'] = $line;
                $header_array['status_info'] = explode( " ", $line );
                continue;
            }

            // Split on the first colon only so values containing ':' stay intact;
            // lines without a colon are not headers and are skipped.
            $parts = explode( ':', $line, 2 );
            if ( count( $parts ) !== 2 ) {
                continue;
            }
            $header_array[ trim( $parts[0] ) ] = trim( $parts[1] );
        }

        //Form Return Structure
        return [ "headers" => $header_array, "body" => $body ];
    }
    /**
     * Convert a relative URL to an absolute URL.
     * Developed By: Potent Pages, LLC (https://potentpages.com/)
     * From: https://potentpages.com/web-crawler-development/tutorials/php/simple-php-web-spider
     * Based On: https://stackoverflow.com/questions/4444475/transfrom-relative-path-into-absolute-url-using-php
     *
     * Resolution rules, in order:
     *  - an empty $relative or $base yields "";
     *  - a $base lacking scheme, host or path prints a diagnostic and yields "";
     *  - a $relative that cannot be parsed yields "";
     *  - a $relative that already has a scheme is returned unchanged;
     *  - a protocol-relative $relative ("//host/path") inherits the base scheme;
     *  - a query-only or fragment-only $relative is appended to the base path;
     *  - anything else is resolved against the directory of the base path, after
     *    which "//", "/./" and "/dir/../" sequences are collapsed.
     *
     * @param string $relative Link as found in the document (href value).
     * @param string $base     Absolute URL of the document containing the link.
     *
     * @return string Absolute URL, or "" when it cannot be built.
     */
    function relativeToAbsolute( $relative, $base )
    {
        if ( $relative == "" || $base == "" ) {
            return "";
        }

        //Check Base
        $base_parsed = parse_url( $base );
        if ( !is_array( $base_parsed ) || !isset( $base_parsed['scheme'], $base_parsed['host'], $base_parsed['path'] ) ) {
            echo "Base Path \"$base\" Not Absolute Link\n";
            return "";
        }

        //Parse Relative
        $relative_parsed = parse_url( $relative );
        if ( $relative_parsed === false ) {
            // Seriously malformed link: nothing sensible can be built from it.
            return "";
        }

        //If relative URL already has a scheme, it's already absolute
        if ( isset( $relative_parsed['scheme'] ) && $relative_parsed['scheme'] != '' ) {
            return $relative;
        }

        //Protocol-relative link ("//host/path"): only the scheme is inherited
        if ( isset( $relative_parsed['host'] ) ) {
            return $base_parsed['scheme'] . ':' . $relative;
        }

        //Build "[user[:pass]@]host[:port]" once so every branch below keeps it
        $authority = '';
        if ( isset( $base_parsed['user'] ) ) {
            $authority .= $base_parsed['user'];
            if ( isset( $base_parsed['pass'] ) ) {
                $authority .= ':' . $base_parsed['pass'];
            }
            $authority .= '@';
        }
        $authority .= $base_parsed['host'];
        if ( isset( $base_parsed['port'] ) ) {
            $authority .= ':' . $base_parsed['port'];
        }

        //If only a query or a fragment, return base (without any fragment or query) + relative
        if ( !isset( $relative_parsed['path'] ) ) {
            return $base_parsed['scheme'] . '://' . $authority . $base_parsed['path'] . $relative;
        }

        //Remove non-directory portion from path
        $path = preg_replace( '#/[^/]*$#', '', $base_parsed['path'] );

        //If relative path already points to root, drop the base directory
        if ( $relative[0] == '/' ) {
            $path = '';
        }

        //Working Absolute URL (without scheme)
        $abs = $authority . $path . '/' . $relative;

        //Replace any '//' or '/./' or '/foo/../' with '/' until nothing changes
        $regex = [ '#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#' ];
        do {
            $abs = preg_replace( $regex, '/', $abs, -1, $n );
        } while ( $n > 0 );

        //Return Absolute URL
        return $base_parsed['scheme'] . '://' . $abs;
    }
    /**
     * Run one parameterised statement on the crawler database (private helper).
     *
     * @param mysqli $conn   Open database connection.
     * @param string $sql    Statement text with "?" placeholders.
     * @param string $types  mysqli type string, one character per placeholder.
     * @param array  $params Values to bind, in placeholder order.
     *
     * @return bool True when the statement was prepared and executed successfully.
     */
    function _executePagesStatement( $conn, $sql, $types, $params ) {
        try {
            $stmt = mysqli_prepare( $conn, $sql );
            if ( $stmt === false ) {
                return false;
            }
            mysqli_stmt_bind_param( $stmt, $types, ...$params );
            $ok = mysqli_stmt_execute( $stmt );
            mysqli_stmt_close( $stmt );
            return $ok;
        } catch ( mysqli_sql_exception $e ) {
            // mysqli throws instead of returning false when exception reporting
            // is enabled (the default since PHP 8.1); report it the same way.
            return false;
        }
    }

    /**
     * Download one page, store its title / meta description / first h1 in the
     * `pages` table and queue every same-host link found on it.
     *
     * Rows are written with "INSERT ... ON DUPLICATE KEY UPDATE" and
     * "INSERT IGNORE", keyed by the URL path.
     * NOTE(review): this presumably relies on a UNIQUE index on pages.path -
     * without one, every download would add a new row instead of updating the
     * queued row. Confirm against the table definition.
     *
     * Uses the global $mysql_conn connection. Terminates the script with die()
     * when the URL cannot be parsed or a database statement fails.
     *
     * @param string      $target  Absolute URL of the page to download.
     * @param string|null $referer URL of the page that linked to $target.
     *
     * @return bool True when the page was downloaded (HTTP 200) and stored,
     *              false when it was only marked as done (no path component or
     *              a non-200 response).
     */
    function parsePage( $target, $referer ) {
        global $mysql_conn;

        //Parse URL and get Components
        $url_components = parse_url( $target );
        if ( $url_components === false ) {
            die( 'Unable to Parse URL' );
        }
        $url_host = $url_components['host'] ?? '';

        if ( !array_key_exists( 'path', $url_components ) ) {
            //If not a valid path, mark as done
            $sql = 'INSERT INTO pages (path, download_time) VALUES (?, NOW()) ON DUPLICATE KEY UPDATE download_time=NOW()';
            if ( !_executePagesStatement( $mysql_conn, $sql, 's', [ $target ] ) ) {
                echo "Line 128"; // debug marker kept from the posted script
                die( "Error: Unable to perform Download Time Update Query (path)\n" );
            }
            return false;
        }
        $url_path = $url_components['path'];

        //Download Page
        echo "Downloading: $target\n";
        $contents = _http( $target, $referer );
        echo "Done\n";

        //Check Status (a missing status code counts as a failure)
        $status_code = (int) ( $contents['headers']['status_info'][1] ?? 0 );
        if ( $status_code !== 200 ) {
            //If not ok, mark as downloaded but skip
            $sql = 'INSERT INTO pages (path, download_time) VALUES (?, NOW()) ON DUPLICATE KEY UPDATE download_time=NOW()';
            if ( !_executePagesStatement( $mysql_conn, $sql, 's', [ $url_path ] ) ) {
                echo "Line 144"; // debug marker kept from the posted script
                die( "Error: Unable to perform Download Time Update Query (http status)\n" );
            }
            return false;
        }

        //Parse Contents
        $doc = new DOMDocument();
        libxml_use_internal_errors( true );
        // loadHTML() rejects an empty string on PHP 8, so an empty body is
        // treated as a document with no title, description, h1 or links.
        if ( (string) $contents['body'] !== '' ) {
            $doc->loadHTML( $contents['body'] );
        }
        // Collected markup errors are never inspected; drop them so they do
        // not pile up over a long crawl.
        libxml_clear_errors();

        //Get title
        $title = '';
        $titleTags = $doc->getElementsByTagName( 'title' );
        if ( $titleTags->length > 0 ) {
            $title = $titleTags->item( 0 )->nodeValue;
        }

        //Get Description (the last matching meta tag wins)
        $description = '';
        foreach ( $doc->getElementsByTagName( 'meta' ) as $tag ) {
            if ( $tag->getAttribute( 'name' ) == 'description' ) {
                $description = $tag->getAttribute( 'content' );
            }
        }

        //Get first h1
        $h1 = '';
        $h1Tags = $doc->getElementsByTagName( 'h1' );
        if ( $h1Tags->length > 0 ) {
            $h1 = $h1Tags->item( 0 )->nodeValue;
        }

        //Insert/Update Page Data
        $sql = 'INSERT INTO pages (path, title, description, h1, download_time) VALUES (?, ?, ?, ?, NOW()) '
             . 'ON DUPLICATE KEY UPDATE title=?, description=?, h1=?, download_time=NOW()';
        $params = [ $url_path, $title, $description, $h1, $title, $description, $h1 ];
        if ( !_executePagesStatement( $mysql_conn, $sql, 'sssssss', $params ) ) {
            echo "Line 185"; // debug marker kept from the posted script
            die( "Error: Unable to perform Insert Query\n" );
        }

        //Get Links
        $links = [];
        foreach ( $doc->getElementsByTagName( 'a' ) as $tag ) {
            $href_value = $tag->getAttribute( 'href' );
            if ( !$href_value ) {
                continue;
            }

            $link_absolute = relativeToAbsolute( $href_value, $target );
            $link_parsed = parse_url( $link_absolute );
            if ( !is_array( $link_parsed ) ) {
                // One malformed link should not abort the whole crawl.
                continue;
            }

            // mailto:, javascript:, tel: and similar links are not pages.
            if ( isset( $link_parsed['scheme'] ) && !in_array( strtolower( $link_parsed['scheme'] ), [ 'http', 'https' ], true ) ) {
                continue;
            }

            $same_host = !isset( $link_parsed['host'] )
                || $link_parsed['host'] == ""
                || $link_parsed['host'] == $url_host;
            $link_path = $link_parsed['path'] ?? '';

            if ( $same_host && $link_path != "" && !in_array( $link_path, $links, true ) ) {
                $links[] = $link_path;
            }
        }

        //Insert Links (already-known paths are left untouched by INSERT IGNORE)
        $sql = 'INSERT IGNORE INTO pages (path, referer, download_time) VALUES (?, ?, NULL)';
        foreach ( $links as $link ) {
            if ( !_executePagesStatement( $mysql_conn, $sql, 'ss', [ $link, $target ] ) ) {
                echo "Line 199"; // debug marker kept from the posted script
                die( "Error: Unable to perform Insert Link Value Query\n" );
            }
        }

        return true;
    }
    //Define Seed Settings
    $seed_url = "http://localhost/crawler/crawler/7/1.php";
    $seed_components = parse_url( $seed_url );
    if ( $seed_components === false || !isset( $seed_components['scheme'], $seed_components['host'] ) ) {
        die( 'Unable to Seed Parse URL' );
    }
    $seed_scheme = $seed_components['scheme'];
    $seed_host = $seed_components['host'];

    // Prefix that is put in front of every stored path. A non-default port in
    // the seed URL is kept so queued pages are requested from the same server.
    $url_start = $seed_scheme . '://' . $seed_host;
    if ( isset( $seed_components['port'] ) ) {
        $url_start .= ':' . $seed_components['port'];
    }

    //Download Seed URL
    parsePage( $seed_url, "" );

    //Loop through all pages on site: each pass downloads every row that has no
    //download_time yet; the crawl ends when a pass finds no such rows or
    //downloads nothing successfully.
    while ( true ) {
        $counter = 0;

        $select_query = "SELECT * FROM pages WHERE download_time IS NULL";
        $select_result = mysqli_query( $mysql_conn, $select_query );
        if ( $select_result === false ) {
            die( "Unable to select un-downloaded pages\n" );
        }

        if ( mysqli_num_rows( $select_result ) === 0 ) {
            mysqli_free_result( $select_result );
            break;
        }

        // mysqli_fetch_assoc() returns null once the rows are exhausted.
        while ( ( $row = mysqli_fetch_assoc( $select_result ) ) ) {
            $path = (string) $row['path'];
            $referer = $row['referer'];

            //Only site-relative paths (starting with '/') can be downloaded
            if ( $path === '' || $path[0] != '/' ) {
                continue;
            }

            if ( parsePage( $url_start . $path, $referer ) ) {
                $counter++;
            }

            // Pause between requests so the target server is not flooded.
            sleep( 1 );
        }
        mysqli_free_result( $select_result );

        if ( $counter == 0 ) {
            break;
        }
    }

    ?>
    ““

    to post a comment
    PHP

    0Be the first to comment 😎

    ×

    Success!

    Help @developer_web spread the word by sharing this article on Twitter...

    Tweet This
    Sign in
    Forgot password?
    Sign in with TwitchSign in with GithubCreate Account
    about: ({
    version: 0.1.9 BETA 5.24,
    whats_new: community page,
    up_next: more Davinci•003 tasks,
    coming_soon: events calendar,
    social: @webDeveloperHQ
    });

    legal: ({
    terms: of use,
    privacy: policy
    });
    changelog: (
    version: 0.1.9,
    notes: added community page

    version: 0.1.8,
    notes: added Davinci•003

    version: 0.1.7,
    notes: upvote answers to bounties

    version: 0.1.6,
    notes: article editor refresh
    )...
    recent_tips: (
    tipper: @AriseFacilitySolutions09,
    tipped: article
    amount: 1000 SATS,

    tipper: @Yussuf4331,
    tipped: article
    amount: 1000 SATS,

    tipper: @darkwebsites540,
    tipped: article
    amount: 10 SATS,
    )...