
Let Us Add Some Features On Spider Number 8

PHP folks,

I found 9 web crawler PHP scripts online. Some are incomplete. Let's fix them together.
In this thread, we work on spider number 8 from my list.

TEST RESULT: DEPTH CRAWLING FAILS! ONLY ECHOES FOUND LINKS ON VISITED PAGE!
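(For anyone who wants to reproduce that result, a chain of throwaway local pages, each linking to the next, is enough; with working depth crawling the output should list every page in the chain, not only the links found on the start page. The file names below are examples, not my actual test files.)

```
<?php
// Hypothetical 1.php: a tiny test page that links to the next page in the chain.
// 2.php and 3.php look the same, each pointing at the following page, so a crawl
// with enough depth should reach all three of them.
echo '<html><body><p>Page 1</p><a href="2.php">go to page 2</a></body></html>';
```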

WORK WE NEED TO DO:
We need to add lines for it to:

  • add depth crawling (a rough sketch of what I mean follows the code and my question below)

```
<?php

//https://stackoverflow.com/questions/2313107/how-do-i-make-a-simple-crawler-in-php
//TEST RESULT: DEPTH CRAWLING FAILS! ONLY ECHOES FOUND LINKS ON VISITED PAGE!
//Author: Wonderland.

ini_set('display_errors', true);
error_reporting(E_ALL);

class crawler
{
    protected $_url;
    protected $_depth;
    protected $_host;
    protected $_useHttpAuth = false;
    protected $_user;
    protected $_pass;
    protected $_seen = array();
    protected $_filter = array();

    public function __construct($url, $depth = 50)
    {
        $this->_url = $url;
        $this->_depth = $depth;
        $parse = parse_url($url);
        $this->_host = $parse['host'];
    }

    protected function _processAnchors($content, $url, $depth)
    {
        $dom = new DOMDocument('1.0');
        @$dom->loadHTML($content);
        $anchors = $dom->getElementsByTagName('a');

        foreach ($anchors as $element) {
            $href = $element->getAttribute('href');
            if (0 !== strpos($href, 'http')) {
                // Relative link: rebuild an absolute URL from the current page's parts
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    $href = http_build_url($url, array('path' => $path));
                } else {
                    $parts = parse_url($url);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
            }
            // Crawl only links that belong to the start domain (checked in isValid())
            $this->crawl_page($href, $depth - 1);
        }
    }

    protected function _getContent($url)
    {
        $handle = curl_init($url);
        if ($this->_useHttpAuth) {
            curl_setopt($handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
            curl_setopt($handle, CURLOPT_USERPWD, $this->_user . ":" . $this->_pass);
        }
        // follows 302 redirects, but creates problems with authentication
        // curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE);
        // return the content
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);

        /* Get the HTML or whatever is linked in $url. */
        $response = curl_exec($handle);
        // response total time
        $time = curl_getinfo($handle, CURLINFO_TOTAL_TIME);
        /* Check for 404 (file not found). */
        $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);

        curl_close($handle);
        return array($response, $httpCode, $time);
    }

    protected function _printResult($url, $depth, $httpcode, $time)
    {
        ob_end_flush();
        $currentDepth = $this->_depth - $depth;
        $count = count($this->_seen);
        echo "N::$count,CODE::$httpcode,TIME::$time,DEPTH::$currentDepth URL::$url <br>";
        ob_start();
        flush();
    }

    protected function isValid($url, $depth)
    {
        if (strpos($url, $this->_host) === false
            || $depth === 0
            || isset($this->_seen[$url])
        ) {
            return false;
        }
        foreach ($this->_filter as $excludePath) {
            if (strpos($url, $excludePath) !== false) {
                return false;
            }
        }
        return true;
    }

    public function crawl_page($url, $depth)
    {
        if (!$this->isValid($url, $depth)) {
            return;
        }
        // add to the seen URLs
        $this->_seen[$url] = true;
        // get content and return code
        list($content, $httpcode, $time) = $this->_getContent($url);
        // print result for the current page
        $this->_printResult($url, $depth, $httpcode, $time);
        // process subpages
        $this->_processAnchors($content, $url, $depth);
    }

    public function setHttpAuth($user, $pass)
    {
        $this->_useHttpAuth = true;
        $this->_user = $user;
        $this->_pass = $pass;
    }

    public function addFilterPath($path)
    {
        $this->_filter[] = $path;
    }

    public function run()
    {
        $this->crawl_page($this->_url, $this->_depth);
    }
}

// USAGE
$startURL = 'http://localhost/crawler/crawler/3/1.php';
$depth = 50;
$username = 'YOURUSER';
$password = 'YOURPASS';
$crawler = new crawler($startURL, $depth);
$crawler->setHttpAuth($username, $password);
// Exclude paths with the following structure from being processed
$crawler->addFilterPath('customer/account/login/referer');
$crawler->run();

?>
```

Can someone add a few lines of code to do this so I can learn how to do it?
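To show roughly what I mean by depth crawling, here is a minimal, standalone sketch (not spider number 8 itself; the function name and the depth value are just placeholders): visit a page, print a line for it, then recurse into the links found on it with a smaller depth budget, stopping when the budget runs out or a URL has already been seen.

```
<?php
// Minimal depth-limited crawl sketch, for illustration only.
// Assumes allow_url_fopen is enabled; the function name and depth value are placeholders.

function crawlDepthFirst(string $url, int $depth, array &$seen = array()): void
{
    // Stop when the depth budget is used up or the URL was already visited.
    if ($depth <= 0 || isset($seen[$url])) {
        return;
    }
    $seen[$url] = true;

    // Fetch the page body (warnings suppressed for unreachable URLs).
    $html = @file_get_contents($url);
    if ($html === false) {
        return;
    }
    echo "DEPTH $depth -> $url<br>\n";

    // Extract anchors and recurse with a smaller depth budget.
    $dom = new DOMDocument();
    @$dom->loadHTML($html);
    foreach ($dom->getElementsByTagName('a') as $a) {
        $href = $a->getAttribute('href');
        // Only absolute links are followed here; resolving relative ones is left out of the sketch.
        if (strpos($href, 'http') === 0) {
            crawlDepthFirst($href, $depth - 1, $seen);
        }
    }
}

// Example: crawl two levels deep starting from the local test page used above.
crawlDepthFirst('http://localhost/crawler/crawler/3/1.php', 2);
```

Spider number 8 seems to attempt the same pattern (crawl_page() calls _processAnchors(), which calls crawl_page() again with $depth - 1), so the missing lines presumably belong somewhere in that loop.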

@developer_web (author), Aug 31, 2020:

@NogDog,

I'd appreciate input from you, at least. ;)