fix scrape failures on google
This commit is contained in:
@@ -522,7 +522,7 @@ class google{
|
|||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
|
||||||
"User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
|
"User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
|
||||||
"Accept: text/html, application/xml;q=0.9, */*;q=0.8",
|
"Accept: text/html, application/xml;q=0.9, */*;q=0.8",
|
||||||
"Accept-Language: nl,en;q=0.8",
|
"Accept-Language: en-US,en;q=0.8",
|
||||||
"Accept-Encoding: gzip, deflate",
|
"Accept-Encoding: gzip, deflate",
|
||||||
"Connection: Keep-Alive",
|
"Connection: Keep-Alive",
|
||||||
"Cache-Control: no-cache"
|
"Cache-Control: no-cache"
|
||||||
@@ -770,32 +770,18 @@ class google{
|
|||||||
$this->fuckhtml->load($container);
|
$this->fuckhtml->load($container);
|
||||||
|
|
||||||
// probe for search result
|
// probe for search result
|
||||||
$sprobe =
|
$title =
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
->getElementsByAttributeValue(
|
->getElementsByAttributeValue(
|
||||||
"role",
|
"role",
|
||||||
"presentation",
|
"link",
|
||||||
"a"
|
"div"
|
||||||
);
|
);
|
||||||
|
|
||||||
if(count($sprobe) !== 0){
|
if(count($title) !== 0){
|
||||||
|
|
||||||
// we found a search result
|
// we found a search result
|
||||||
|
|
||||||
$title =
|
|
||||||
$this->fuckhtml
|
|
||||||
->getElementsByAttributeValue(
|
|
||||||
"role",
|
|
||||||
"link",
|
|
||||||
"div"
|
|
||||||
);
|
|
||||||
|
|
||||||
if(count($title) === 0){
|
|
||||||
|
|
||||||
// should not happen
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$title =
|
$title =
|
||||||
$this->titledots(
|
$this->titledots(
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
@@ -804,6 +790,41 @@ class google{
|
|||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// get url
|
||||||
|
$sprobe =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByTagName(
|
||||||
|
"a"
|
||||||
|
);
|
||||||
|
|
||||||
|
$link = null;
|
||||||
|
|
||||||
|
foreach($sprobe as $possible_link){
|
||||||
|
|
||||||
|
if(
|
||||||
|
isset($possible_link["attributes"]["href"]) &&
|
||||||
|
preg_match(
|
||||||
|
'/^\/url\?q=/',
|
||||||
|
$possible_link["attributes"]["href"]
|
||||||
|
)
|
||||||
|
){
|
||||||
|
|
||||||
|
$link =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$possible_link["attributes"]["href"]
|
||||||
|
);
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if($link === null){
|
||||||
|
|
||||||
|
// should not happen
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// get description
|
// get description
|
||||||
// as usual, theres a thousand fucking possible divs for this one
|
// as usual, theres a thousand fucking possible divs for this one
|
||||||
|
|
||||||
@@ -968,7 +989,7 @@ class google{
|
|||||||
$out["web"][] = [
|
$out["web"][] = [
|
||||||
"title" => $title,
|
"title" => $title,
|
||||||
"description" => $description,
|
"description" => $description,
|
||||||
"url" => $this->unshiturl($sprobe[0]["attributes"]["href"]),
|
"url" => $this->unshiturl($link),
|
||||||
"date" => $time,
|
"date" => $time,
|
||||||
"type" => "web",
|
"type" => "web",
|
||||||
"thumb" => $thumb,
|
"thumb" => $thumb,
|
||||||
|
|||||||
Reference in New Issue
Block a user