fixed yandex image scraper

This commit is contained in:
lolcat 2023-11-09 08:06:14 -05:00
parent 5236452f45
commit 165d80f80b
1 changed file with 59 additions and 46 deletions

View File

@@ -636,6 +636,7 @@ class yandex{
throw new Exception("Failed to get JSON"); throw new Exception("Failed to get JSON");
} }
/* /*
$handle = fopen("scraper/yandex.json", "r"); $handle = fopen("scraper/yandex.json", "r");
$json = fread($handle, filesize("scraper/yandex.json")); $json = fread($handle, filesize("scraper/yandex.json"));
@@ -656,31 +657,22 @@ class yandex{
throw new Exception("Failed to decode JSON"); throw new Exception("Failed to decode JSON");
} }
// get html
$html = "";
foreach($json["blocks"] as $block){
$html .= $block["html"];
}
$this->fuckhtml->load($html);
$div = $this->fuckhtml->getElementsByTagName("div");
$out = [ $out = [
"status" => "ok", "status" => "ok",
"npt" => null, "npt" => null,
"image" => [] "image" => []
]; ];
// check for next page // get html
$html = "";
foreach($json["blocks"] as $block){
$html .= $block["html"];
// get next page
if( if(
count( isset($block["params"]["nextPageUrl"]) &&
$this->fuckhtml !empty($block["params"]["nextPageUrl"])
->getElementsByClassName(
"more more_direction_next",
$div
)
) !== 0
){ ){
$request["nsfw"] = $nsfw; $request["nsfw"] = $nsfw;
@@ -700,24 +692,45 @@ class yandex{
$proxy $proxy
); );
} }
}
$this->fuckhtml->load($html);
// get search results // get search results
$data = null;
foreach( foreach(
$this->fuckhtml $this->fuckhtml
->getElementsByClassName( ->getElementsByClassName(
"serp-item serp-item_type_search", "Root",
$div "div"
) ) as $div
as $image
){ ){
$image = if(isset($div["attributes"]["data-state"])){
json_decode(
$image $tmp = json_decode(
["attributes"] $this->fuckhtml
["data-bem"], ->getTextContent(
$div["attributes"]["data-state"]
),
true true
)["serp-item"]; );
if(isset($tmp["initialState"]["serpList"])){
$data = $tmp;
break;
}
}
}
if($data === null){
throw new Exception("Failed to extract JSON");
}
foreach($data["initialState"]["serpList"]["items"]["entities"] as $image){
$title = [html_entity_decode($image["snippet"]["title"], ENT_QUOTES | ENT_HTML5)]; $title = [html_entity_decode($image["snippet"]["title"], ENT_QUOTES | ENT_HTML5)];
@@ -738,7 +751,7 @@ class yandex{
"url" => htmlspecialchars_decode($image["snippet"]["url"]) "url" => htmlspecialchars_decode($image["snippet"]["url"])
]; ];
foreach($image["dups"] as $dup){ foreach($image["viewerData"]["dups"] as $dup){
$tmp["source"][] = [ $tmp["source"][] = [
"url" => htmlspecialchars_decode($dup["url"]), "url" => htmlspecialchars_decode($dup["url"]),
@@ -752,10 +765,10 @@ class yandex{
preg_replace( preg_replace(
'/^\/\//', '/^\/\//',
"https://", "https://",
htmlspecialchars_decode($image["thumb"]["url"]) htmlspecialchars_decode($image["viewerData"]["thumb"]["url"])
), ),
"width" => (int)$image["thumb"]["size"]["width"], "width" => (int)$image["viewerData"]["thumb"]["size"]["width"],
"height" => (int)$image["thumb"]["size"]["height"] "height" => (int)$image["viewerData"]["thumb"]["size"]["height"]
]; ];
$out["image"][] = $tmp; $out["image"][] = $tmp;