⚝
One Hat Cyber Team
⚝
Your IP:
216.73.216.180
Server IP:
13.127.59.50
Server:
Linux ip-172-31-46-210 5.15.0-1033-aws #37~20.04.1-Ubuntu SMP Fri Mar 17 11:39:30 UTC 2023 x86_64
Server Software:
Apache/2.4.41 (Ubuntu)
PHP Version:
7.4.3-4ubuntu2.29
Buat File
|
Buat Folder
Eksekusi
Dir :
~
/
home
/
ubuntu
/
vendor
/
spatie
/
crawler
/
src
/
View File Name :
LinkAdder.php
<?php

namespace Spatie\Crawler;

use GuzzleHttp\Psr7\Uri;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
use Symfony\Component\DomCrawler\Link;
use Tree\Node\Node;

/**
 * Extracts links from an HTML document and pushes the ones that pass all
 * filters onto the owning crawler's queue.
 */
class LinkAdder
{
    /** @var \Spatie\Crawler\Crawler */
    protected $crawler;

    /**
     * @param \Spatie\Crawler\Crawler $crawler Crawler whose depth tree and queue are fed by this object.
     */
    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;
    }

    /**
     * Extract every link from $html and queue those that should be crawled.
     *
     * The pipeline runs in this order: keep http/https URLs, strip the
     * fragment, register the URL in the crawler's depth tree (dropping it when
     * no node is returned or the node must not be crawled), drop URLs whose
     * path contains "/tel:", then queue what is left.
     *
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl URL of the page $html was fetched from.
     */
    public function addFromHtml(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) use ($foundOnUrl) {
                // A falsy result from addToDepthTree() means there is no node to evaluate.
                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
                    return false;
                }

                return $this->shouldCrawl($node);
            })
            ->filter(function (UriInterface $url) {
                return strpos($url->getPath(), '/tel:') === false;
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                // Skip (without aborting the iteration) once the crawl limit is hit.
                if ($this->crawler->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->crawler->addToCrawlQueue($crawlUrl);
            });
    }

    /**
     * Collect the URIs of all `<a>` elements and `<link rel="next|prev">`
     * elements found in $html.
     *
     * Links are dropped when isInvalidHrefNode() flags them, when the crawler
     * rejects nofollow links and the element carries a `nofollow` rel token,
     * or when the resolved URI cannot be parsed by Uri.
     *
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl)
    {
        // NOTE(review): $foundOnUrl is handed over as an object; presumably DomCrawler
        // relies on its string conversion - confirm before enabling strict_types here.
        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a | //link[@rel="next" or @rel="prev"]')->links())
            ->reject(function (Link $link) {
                if ($this->isInvalidHrefNode($link)) {
                    return true;
                }

                if ($this->crawler->mustRejectNofollowLinks() && $this->hasNofollowRel($link)) {
                    return true;
                }

                return false;
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    // Unparsable URI: yield null so the trailing filter() discards it.
                    return;
                }
            })
            ->filter();
    }

    /**
     * Tell whether the URI uses a scheme this class is willing to crawl (http or https).
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }

    /**
     * Return the URL with its fragment removed.
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    /**
     * Decide whether a depth-tree node may be crawled.
     *
     * A node is rejected when robots rules must be respected and disallow its
     * value for the crawler's user agent, or when a maximum depth is set and
     * the node lies deeper than that.
     */
    protected function shouldCrawl(Node $node): bool
    {
        if ($this->crawler->mustRespectRobots() && ! $this->crawler->getRobotsTxt()->allows($node->getValue(), $this->crawler->getUserAgent())) {
            return false;
        }

        $maximumDepth = $this->crawler->getMaximumDepth();

        if (is_null($maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $maximumDepth;
    }

    /**
     * Flag a link as invalid: true only for an `a` node that has no next
     * sibling and no child nodes; every other node is considered valid.
     */
    protected function isInvalidHrefNode(Link $link): bool
    {
        if ($link->getNode()->nodeName !== 'a') {
            return false;
        }

        if ($link->getNode()->nextSibling !== null) {
            return false;
        }

        if ($link->getNode()->childNodes->length !== 0) {
            return false;
        }

        return true;
    }

    /**
     * Tell whether the link's `rel` attribute contains the `nofollow` token.
     *
     * `rel` holds a whitespace-separated list of case-insensitive tokens, so
     * values such as "nofollow noopener" or "NoFollow" must match as well as
     * the bare "nofollow".
     */
    private function hasNofollowRel(Link $link): bool
    {
        $rel = strtolower((string) $link->getNode()->getAttribute('rel'));

        // preg_split() may return false on failure; treat that as "no tokens".
        $tokens = preg_split('/\s+/', $rel, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return in_array('nofollow', $tokens, true);
    }
}