package and dependencies

This commit is contained in:
RafficMohammed
2023-01-08 02:57:24 +05:30
parent d5332eb421
commit 1d54b8bc7f
4309 changed files with 193331 additions and 172289 deletions

View File

@@ -30,62 +30,44 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* The default namespace prefix to be used with XPath and CSS expressions.
*
* @var string
*/
private $defaultNamespacePrefix = 'default';
private string $defaultNamespacePrefix = 'default';
/**
* A map of manually registered namespaces.
*
* @var array<string, string>
*/
private $namespaces = [];
private array $namespaces = [];
/**
* A map of cached namespaces.
*
* @var \ArrayObject
*/
private $cachedNamespaces;
private \ArrayObject $cachedNamespaces;
/**
* The base href value.
*
* @var string|null
*/
private $baseHref;
/**
* @var \DOMDocument|null
*/
private $document;
private ?string $baseHref;
private ?\DOMDocument $document = null;
/**
* @var list<\DOMNode>
*/
private $nodes = [];
private array $nodes = [];
/**
* Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
*
* @var bool
*/
private $isHtml = true;
private bool $isHtml = true;
/**
* @var HTML5|null
*/
private $html5Parser;
private HTML5 $html5Parser;
/**
* @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A Node to use as the base for the crawling
*/
public function __construct($node = null, string $uri = null, string $baseHref = null)
public function __construct(\DOMNodeList|\DOMNode|array|string $node = null, string $uri = null, string $baseHref = null)
{
$this->uri = $uri;
$this->baseHref = $baseHref ?: $uri;
$this->html5Parser = class_exists(HTML5::class) ? new HTML5(['disable_html_ns' => true]) : null;
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
$this->cachedNamespaces = new \ArrayObject();
$this->add($node);
@@ -93,20 +75,16 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns the current URI.
*
* @return string|null
*/
public function getUri()
public function getUri(): ?string
{
return $this->uri;
}
/**
* Returns base href.
*
* @return string|null
*/
public function getBaseHref()
public function getBaseHref(): ?string
{
return $this->baseHref;
}
@@ -131,7 +109,7 @@ class Crawler implements \Countable, \IteratorAggregate
*
* @throws \InvalidArgumentException when node is not the expected type
*/
public function add($node)
public function add(\DOMNodeList|\DOMNode|array|string|null $node)
{
if ($node instanceof \DOMNodeList) {
$this->addNodeList($node);
@@ -230,14 +208,11 @@ class Crawler implements \Countable, \IteratorAggregate
public function addXmlContent(string $content, string $charset = 'UTF-8', int $options = \LIBXML_NONET)
{
// remove the default namespace if it's the only namespace to make XPath expressions simpler
if (!preg_match('/xmlns:/', $content)) {
if (!str_contains($content, 'xmlns:')) {
$content = str_replace('xmlns', 'ns', $content);
}
$internalErrors = libxml_use_internal_errors(true);
if (\LIBXML_VERSION < 20900) {
$disableEntities = libxml_disable_entity_loader(true);
}
$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;
@@ -247,9 +222,6 @@ class Crawler implements \Countable, \IteratorAggregate
}
libxml_use_internal_errors($internalErrors);
if (\LIBXML_VERSION < 20900) {
libxml_disable_entity_loader($disableEntities);
}
$this->addDocument($dom);
@@ -309,9 +281,7 @@ class Crawler implements \Countable, \IteratorAggregate
throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
}
if (null === $this->document) {
$this->document = $node->ownerDocument;
}
$this->document ??= $node->ownerDocument;
// Don't add duplicate nodes in the Crawler
if (\in_array($node, $this->nodes, true)) {
@@ -323,10 +293,8 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns a node given its position in the node list.
*
* @return static
*/
public function eq(int $position)
public function eq(int $position): static
{
if (isset($this->nodes[$position])) {
return $this->createSubCrawler($this->nodes[$position]);
@@ -351,7 +319,7 @@ class Crawler implements \Countable, \IteratorAggregate
*
* @return array An array of values returned by the anonymous function
*/
public function each(\Closure $closure)
public function each(\Closure $closure): array
{
$data = [];
foreach ($this->nodes as $i => $node) {
@@ -363,10 +331,8 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Slices the list of nodes by $offset and $length.
*
* @return static
*/
public function slice(int $offset = 0, int $length = null)
public function slice(int $offset = 0, int $length = null): static
{
return $this->createSubCrawler(\array_slice($this->nodes, $offset, $length));
}
@@ -377,10 +343,8 @@ class Crawler implements \Countable, \IteratorAggregate
* To remove a node from the list, the anonymous function must return false.
*
* @param \Closure $closure An anonymous function
*
* @return static
*/
public function reduce(\Closure $closure)
public function reduce(\Closure $closure): static
{
$nodes = [];
foreach ($this->nodes as $i => $node) {
@@ -394,20 +358,16 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns the first node of the current selection.
*
* @return static
*/
public function first()
public function first(): static
{
return $this->eq(0);
}
/**
* Returns the last node of the current selection.
*
* @return static
*/
public function last()
public function last(): static
{
return $this->eq(\count($this->nodes) - 1);
}
@@ -415,11 +375,9 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns the siblings nodes of the current selection.
*
* @return static
*
* @throws \InvalidArgumentException When current node is empty
*/
public function siblings()
public function siblings(): static
{
if (!$this->nodes) {
throw new \InvalidArgumentException('The current node list is empty.');
@@ -470,11 +428,9 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns the next siblings nodes of the current selection.
*
* @return static
*
* @throws \InvalidArgumentException When current node is empty
*/
public function nextAll()
public function nextAll(): static
{
if (!$this->nodes) {
throw new \InvalidArgumentException('The current node list is empty.');
@@ -486,11 +442,9 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns the previous sibling nodes of the current selection.
*
* @return static
*
* @throws \InvalidArgumentException
*/
public function previousAll()
public function previousAll(): static
{
if (!$this->nodes) {
throw new \InvalidArgumentException('The current node list is empty.');
@@ -499,28 +453,12 @@ class Crawler implements \Countable, \IteratorAggregate
return $this->createSubCrawler($this->sibling($this->getNode(0), 'previousSibling'));
}
/**
* Returns the parent nodes of the current selection.
*
* @return static
*
* @throws \InvalidArgumentException When current node is empty
*/
public function parents()
{
trigger_deprecation('symfony/dom-crawler', '5.3', 'The %s() method is deprecated, use ancestors() instead.', __METHOD__);
return $this->ancestors();
}
/**
* Returns the ancestors of the current selection.
*
* @return static
*
* @throws \InvalidArgumentException When the current node is empty
*/
public function ancestors()
public function ancestors(): static
{
if (!$this->nodes) {
throw new \InvalidArgumentException('The current node list is empty.');
@@ -541,12 +479,10 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns the children nodes of the current selection.
*
* @return static
*
* @throws \InvalidArgumentException When current node is empty
* @throws \RuntimeException If the CssSelector Component is not available and $selector is provided
*/
public function children(string $selector = null)
public function children(string $selector = null): static
{
if (!$this->nodes) {
throw new \InvalidArgumentException('The current node list is empty.');
@@ -567,11 +503,9 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns the attribute value of the first node of the list.
*
* @return string|null
*
* @throws \InvalidArgumentException When current node is empty
*/
public function attr(string $attribute)
public function attr(string $attribute): ?string
{
if (!$this->nodes) {
throw new \InvalidArgumentException('The current node list is empty.');
@@ -585,11 +519,9 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns the node name of the first node of the list.
*
* @return string
*
* @throws \InvalidArgumentException When current node is empty
*/
public function nodeName()
public function nodeName(): string
{
if (!$this->nodes) {
throw new \InvalidArgumentException('The current node list is empty.');
@@ -606,11 +538,9 @@ class Crawler implements \Countable, \IteratorAggregate
* @param string|null $default When not null: the value to return when the current node is empty
* @param bool $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
*
* @return string
*
* @throws \InvalidArgumentException When current node is empty
*/
public function text(string $default = null, bool $normalizeWhitespace = true)
public function text(string $default = null, bool $normalizeWhitespace = true): string
{
if (!$this->nodes) {
if (null !== $default) {
@@ -642,11 +572,9 @@ class Crawler implements \Countable, \IteratorAggregate
*
* @param string|null $default When not null: the value to return when the current node is empty
*
* @return string
*
* @throws \InvalidArgumentException When current node is empty
*/
public function html(string $default = null)
public function html(string $default = null): string
{
if (!$this->nodes) {
if (null !== $default) {
@@ -659,7 +587,7 @@ class Crawler implements \Countable, \IteratorAggregate
$node = $this->getNode(0);
$owner = $node->ownerDocument;
if (null !== $this->html5Parser && '<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
if ('<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
$owner = $this->html5Parser;
}
@@ -680,7 +608,7 @@ class Crawler implements \Countable, \IteratorAggregate
$node = $this->getNode(0);
$owner = $node->ownerDocument;
if (null !== $this->html5Parser && '<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
if ('<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
$owner = $this->html5Parser;
}
@@ -692,10 +620,8 @@ class Crawler implements \Countable, \IteratorAggregate
*
* Since an XPath expression might evaluate to either a simple type or a \DOMNodeList,
* this method will return either an array of simple types or a new Crawler instance.
*
* @return array|Crawler
*/
public function evaluate(string $xpath)
public function evaluate(string $xpath): array|Crawler
{
if (null === $this->document) {
throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
@@ -723,10 +649,8 @@ class Crawler implements \Countable, \IteratorAggregate
* Example:
*
* $crawler->filter('h1 a')->extract(['_text', 'href']);
*
* @return array
*/
public function extract(array $attributes)
public function extract(array $attributes): array
{
$count = \count($attributes);
@@ -756,10 +680,8 @@ class Crawler implements \Countable, \IteratorAggregate
* is considered as a fake parent of the elements inside it.
* This means that a child selector "div" or "./div" will match only
* the div elements of the current crawler, not their children.
*
* @return static
*/
public function filterXPath(string $xpath)
public function filterXPath(string $xpath): static
{
$xpath = $this->relativize($xpath);
@@ -776,11 +698,9 @@ class Crawler implements \Countable, \IteratorAggregate
*
* This method only works if you have installed the CssSelector Symfony Component.
*
* @return static
*
* @throws \RuntimeException if the CssSelector Component is not available
*/
public function filter(string $selector)
public function filter(string $selector): static
{
$converter = $this->createCssSelectorConverter();
@@ -790,10 +710,8 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Selects links by name or alt value for clickable images.
*
* @return static
*/
public function selectLink(string $value)
public function selectLink(string $value): static
{
return $this->filterRelativeXPath(
sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %1$s) or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %1$s)]]', static::xpathLiteral(' '.$value.' '))
@@ -802,10 +720,8 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Selects images by alt value.
*
* @return static
*/
public function selectImage(string $value)
public function selectImage(string $value): static
{
$xpath = sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', static::xpathLiteral($value));
@@ -814,10 +730,8 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Selects a button by name or alt value for images.
*
* @return static
*/
public function selectButton(string $value)
public function selectButton(string $value): static
{
return $this->filterRelativeXPath(
sprintf('descendant-or-self::input[((contains(%1$s, "submit") or contains(%1$s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s)) or (contains(%1$s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %2$s)) or @id=%3$s or @name=%3$s] | descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %2$s) or @id=%3$s or @name=%3$s]', 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")', static::xpathLiteral(' '.$value.' '), static::xpathLiteral($value))
@@ -827,11 +741,9 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns a Link object for the first node in the list.
*
* @return Link
*
* @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
*/
public function link(string $method = 'get')
public function link(string $method = 'get'): Link
{
if (!$this->nodes) {
throw new \InvalidArgumentException('The current node list is empty.');
@@ -853,7 +765,7 @@ class Crawler implements \Countable, \IteratorAggregate
*
* @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
*/
public function links()
public function links(): array
{
$links = [];
foreach ($this->nodes as $node) {
@@ -870,11 +782,9 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns an Image object for the first node in the list.
*
* @return Image
*
* @throws \InvalidArgumentException If the current node list is empty
*/
public function image()
public function image(): Image
{
if (!\count($this)) {
throw new \InvalidArgumentException('The current node list is empty.');
@@ -894,7 +804,7 @@ class Crawler implements \Countable, \IteratorAggregate
*
* @return Image[]
*/
public function images()
public function images(): array
{
$images = [];
foreach ($this as $node) {
@@ -911,11 +821,9 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* Returns a Form object for the first node in the list.
*
* @return Form
*
* @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
*/
public function form(array $values = null, string $method = null)
public function form(array $values = null, string $method = null): Form
{
if (!$this->nodes) {
throw new \InvalidArgumentException('The current node list is empty.');
@@ -964,10 +872,8 @@ class Crawler implements \Countable, \IteratorAggregate
*
* echo Crawler::xpathLiteral('a\'b"c');
* //prints concat('a', "'", 'b"c')
*
* @return string
*/
public static function xpathLiteral(string $s)
public static function xpathLiteral(string $s): string
{
if (!str_contains($s, "'")) {
return sprintf("'%s'", $s);
@@ -997,10 +903,8 @@ class Crawler implements \Countable, \IteratorAggregate
* Filters the list of nodes with an XPath expression.
*
* The XPath expression should already be processed to apply it in the context of each node.
*
* @return static
*/
private function filterRelativeXPath(string $xpath): object
private function filterRelativeXPath(string $xpath): static
{
$crawler = $this->createSubCrawler(null);
if (null === $this->document) {
@@ -1106,19 +1010,12 @@ class Crawler implements \Countable, \IteratorAggregate
return $xpath; // The XPath expression is invalid
}
/**
* @return \DOMNode|null
*/
public function getNode(int $position)
public function getNode(int $position): ?\DOMNode
{
return $this->nodes[$position] ?? null;
}
/**
* @return int
*/
#[\ReturnTypeWillChange]
public function count()
public function count(): int
{
return \count($this->nodes);
}
@@ -1126,16 +1023,12 @@ class Crawler implements \Countable, \IteratorAggregate
/**
* @return \ArrayIterator<int, \DOMNode>
*/
#[\ReturnTypeWillChange]
public function getIterator()
public function getIterator(): \ArrayIterator
{
return new \ArrayIterator($this->nodes);
}
/**
* @return array
*/
protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling')
protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling'): array
{
$nodes = [];
@@ -1159,9 +1052,6 @@ class Crawler implements \Countable, \IteratorAggregate
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
$internalErrors = libxml_use_internal_errors(true);
if (\LIBXML_VERSION < 20900) {
$disableEntities = libxml_disable_entity_loader(true);
}
$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;
@@ -1171,9 +1061,6 @@ class Crawler implements \Countable, \IteratorAggregate
}
libxml_use_internal_errors($internalErrors);
if (\LIBXML_VERSION < 20900) {
libxml_disable_entity_loader($disableEntities);
}
return $dom;
}
@@ -1187,11 +1074,11 @@ class Crawler implements \Countable, \IteratorAggregate
try {
return mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], $charset);
} catch (\Exception|\ValueError $e) {
} catch (\Exception|\ValueError) {
try {
$htmlContent = iconv($charset, 'UTF-8', $htmlContent);
$htmlContent = mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
} catch (\Exception|\ValueError $e) {
} catch (\Exception|\ValueError) {
}
return $htmlContent;
@@ -1249,10 +1136,8 @@ class Crawler implements \Countable, \IteratorAggregate
* Creates a crawler for some subnodes.
*
* @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $nodes
*
* @return static
*/
private function createSubCrawler($nodes): object
private function createSubCrawler(\DOMNodeList|\DOMNode|array|string|null $nodes): static
{
$crawler = new static($nodes, $this->uri, $this->baseHref);
$crawler->isHtml = $this->isHtml;
@@ -1291,12 +1176,10 @@ class Crawler implements \Countable, \IteratorAggregate
private function canParseHtml5String(string $content): bool
{
if (null === $this->html5Parser) {
return false;
}
if (false === ($pos = stripos($content, '<!doctype html>'))) {
return false;
}
$header = substr($content, 0, $pos);
return '' === $header || $this->isValidHtml5Heading($header);