4get/scraper/google.php

3449 lines
64 KiB
PHP
Raw Normal View History

2023-07-22 18:41:14 +00:00
<?php
2024-06-13 02:41:02 +00:00
// @TODO check for consent.google.com page, if need be
2023-11-29 15:31:59 +00:00
2023-07-22 18:41:14 +00:00
class google{
/**
 * Load the HTML parser and the backend helper, then create the two
 * instances ($this->fuckhtml, $this->backend) used by the other methods.
 */
public function __construct(){
	
	// include_once instead of include: a second plain include of the same
	// file (e.g. when this class is instantiated twice, or the library was
	// already loaded elsewhere in the request) would redeclare the class
	// and abort with a fatal error
	include_once "lib/fuckhtml.php";
	$this->fuckhtml = new fuckhtml();
	
	include_once "lib/backend.php";
	$this->backend = new backend("google");
}
/**
 * Describe the filters available for a given search page.
 *
 * Every page shares the "country" and "nsfw" filters; page-specific
 * filters are merged on top of them.
 *
 * @param string $page One of "web", "images", "videos" or "news"
 * @return array|null Filter definitions keyed by filter name, or null
 *                    when $page is not one of the known pages
 */
public function getfilters($page){
	
	// filters common to every page
	$shared = [
		"country" => [ // gl=<country> (image: cr=countryAF)
			"display" => "Country",
			"option" => [
				"any" => "Instance's country",
				"af" => "Afghanistan",
				"al" => "Albania",
				"dz" => "Algeria",
				"as" => "American Samoa",
				"ad" => "Andorra",
				"ao" => "Angola",
				"ai" => "Anguilla",
				"aq" => "Antarctica",
				"ag" => "Antigua and Barbuda",
				"ar" => "Argentina",
				"am" => "Armenia",
				"aw" => "Aruba",
				"au" => "Australia",
				"at" => "Austria",
				"az" => "Azerbaijan",
				"bs" => "Bahamas",
				"bh" => "Bahrain",
				"bd" => "Bangladesh",
				"bb" => "Barbados",
				"by" => "Belarus",
				"be" => "Belgium",
				"bz" => "Belize",
				"bj" => "Benin",
				"bm" => "Bermuda",
				"bt" => "Bhutan",
				"bo" => "Bolivia",
				"ba" => "Bosnia and Herzegovina",
				"bw" => "Botswana",
				"bv" => "Bouvet Island",
				"br" => "Brazil",
				"io" => "British Indian Ocean Territory",
				"bn" => "Brunei Darussalam",
				"bg" => "Bulgaria",
				"bf" => "Burkina Faso",
				"bi" => "Burundi",
				"kh" => "Cambodia",
				"cm" => "Cameroon",
				"ca" => "Canada",
				"cv" => "Cape Verde",
				"ky" => "Cayman Islands",
				"cf" => "Central African Republic",
				"td" => "Chad",
				"cl" => "Chile",
				"cn" => "China",
				"cx" => "Christmas Island",
				"cc" => "Cocos (Keeling) Islands",
				"co" => "Colombia",
				"km" => "Comoros",
				"cg" => "Congo",
				"cd" => "Congo, the Democratic Republic",
				"ck" => "Cook Islands",
				"cr" => "Costa Rica",
				"ci" => "Cote D'ivoire",
				"hr" => "Croatia",
				"cu" => "Cuba",
				"cy" => "Cyprus",
				"cz" => "Czech Republic",
				"dk" => "Denmark",
				"dj" => "Djibouti",
				"dm" => "Dominica",
				"do" => "Dominican Republic",
				"ec" => "Ecuador",
				"eg" => "Egypt",
				"sv" => "El Salvador",
				"gq" => "Equatorial Guinea",
				"er" => "Eritrea",
				"ee" => "Estonia",
				"et" => "Ethiopia",
				"fk" => "Falkland Islands (Malvinas)",
				"fo" => "Faroe Islands",
				"fj" => "Fiji",
				"fi" => "Finland",
				"fr" => "France",
				"gf" => "French Guiana",
				"pf" => "French Polynesia",
				"tf" => "French Southern Territories",
				"ga" => "Gabon",
				"gm" => "Gambia",
				"ge" => "Georgia",
				"de" => "Germany",
				"gh" => "Ghana",
				"gi" => "Gibraltar",
				"gr" => "Greece",
				"gl" => "Greenland",
				"gd" => "Grenada",
				"gp" => "Guadeloupe",
				"gu" => "Guam",
				"gt" => "Guatemala",
				"gn" => "Guinea",
				"gw" => "Guinea-Bissau",
				"gy" => "Guyana",
				"ht" => "Haiti",
				"hm" => "Heard Island and Mcdonald Islands",
				"va" => "Holy See (Vatican City State)",
				"hn" => "Honduras",
				"hk" => "Hong Kong",
				"hu" => "Hungary",
				"is" => "Iceland",
				"in" => "India",
				"id" => "Indonesia",
				"ir" => "Iran, Islamic Republic",
				"iq" => "Iraq",
				"ie" => "Ireland",
				"il" => "Israel",
				"it" => "Italy",
				"jm" => "Jamaica",
				"jp" => "Japan",
				"jo" => "Jordan",
				"kz" => "Kazakhstan",
				"ke" => "Kenya",
				"ki" => "Kiribati",
				"kp" => "Korea, Democratic People's Republic",
				"kr" => "Korea, Republic",
				"kw" => "Kuwait",
				"kg" => "Kyrgyzstan",
				"la" => "Lao People's Democratic Republic",
				"lv" => "Latvia",
				"lb" => "Lebanon",
				"ls" => "Lesotho",
				"lr" => "Liberia",
				"ly" => "Libyan Arab Jamahiriya",
				"li" => "Liechtenstein",
				"lt" => "Lithuania",
				"lu" => "Luxembourg",
				"mo" => "Macao",
				"mk" => "Macedonia, the Former Yugosalv Republic",
				"mg" => "Madagascar",
				"mw" => "Malawi",
				"my" => "Malaysia",
				"mv" => "Maldives",
				"ml" => "Mali",
				"mt" => "Malta",
				"mh" => "Marshall Islands",
				"mq" => "Martinique",
				"mr" => "Mauritania",
				"mu" => "Mauritius",
				"yt" => "Mayotte",
				"mx" => "Mexico",
				"fm" => "Micronesia, Federated States",
				"md" => "Moldova, Republic",
				"mc" => "Monaco",
				"mn" => "Mongolia",
				"ms" => "Montserrat",
				"ma" => "Morocco",
				"mz" => "Mozambique",
				"mm" => "Myanmar",
				"na" => "Namibia",
				"nr" => "Nauru",
				"np" => "Nepal",
				"nl" => "Netherlands",
				"an" => "Netherlands Antilles",
				"nc" => "New Caledonia",
				"nz" => "New Zealand",
				"ni" => "Nicaragua",
				"ne" => "Niger",
				"ng" => "Nigeria",
				"nu" => "Niue",
				"nf" => "Norfolk Island",
				"mp" => "Northern Mariana Islands",
				"no" => "Norway",
				"om" => "Oman",
				"pk" => "Pakistan",
				"pw" => "Palau",
				"ps" => "Palestinian Territory, Occupied",
				"pa" => "Panama",
				"pg" => "Papua New Guinea",
				"py" => "Paraguay",
				"pe" => "Peru",
				"ph" => "Philippines",
				"pn" => "Pitcairn",
				"pl" => "Poland",
				"pt" => "Portugal",
				"pr" => "Puerto Rico",
				"qa" => "Qatar",
				"re" => "Reunion",
				"ro" => "Romania",
				"ru" => "Russian Federation",
				"rw" => "Rwanda",
				"sh" => "Saint Helena",
				"kn" => "Saint Kitts and Nevis",
				"lc" => "Saint Lucia",
				"pm" => "Saint Pierre and Miquelon",
				"vc" => "Saint Vincent and the Grenadines",
				"ws" => "Samoa",
				"sm" => "San Marino",
				"st" => "Sao Tome and Principe",
				"sa" => "Saudi Arabia",
				"sn" => "Senegal",
				"cs" => "Serbia and Montenegro",
				"sc" => "Seychelles",
				"sl" => "Sierra Leone",
				"sg" => "Singapore",
				"sk" => "Slovakia",
				"si" => "Slovenia",
				"sb" => "Solomon Islands",
				"so" => "Somalia",
				"za" => "South Africa",
				"gs" => "South Georgia and the South Sandwich Islands",
				"es" => "Spain",
				"lk" => "Sri Lanka",
				"sd" => "Sudan",
				"sr" => "Suriname",
				"sj" => "Svalbard and Jan Mayen",
				"sz" => "Swaziland",
				"se" => "Sweden",
				"ch" => "Switzerland",
				"sy" => "Syrian Arab Republic",
				"tw" => "Taiwan, Province of China",
				"tj" => "Tajikistan",
				"tz" => "Tanzania, United Republic",
				"th" => "Thailand",
				"tl" => "Timor-Leste",
				"tg" => "Togo",
				"tk" => "Tokelau",
				"to" => "Tonga",
				"tt" => "Trinidad and Tobago",
				"tn" => "Tunisia",
				"tr" => "Turkey",
				"tm" => "Turkmenistan",
				"tc" => "Turks and Caicos Islands",
				"tv" => "Tuvalu",
				"ug" => "Uganda",
				"ua" => "Ukraine",
				"ae" => "United Arab Emirates",
				"uk" => "United Kingdom",
				"us" => "United States",
				"um" => "United States Minor Outlying Islands",
				"uy" => "Uruguay",
				"uz" => "Uzbekistan",
				"vu" => "Vanuatu",
				"ve" => "Venezuela",
				"vn" => "Viet Nam",
				"vg" => "Virgin Islands, British",
				"vi" => "Virgin Islands, U.S.",
				"wf" => "Wallis and Futuna",
				"eh" => "Western Sahara",
				"ye" => "Yemen",
				"zm" => "Zambia",
				"zw" => "Zimbabwe"
			]
		],
		"nsfw" => [ // translated to Google's "safe" parameter by the caller
			"display" => "NSFW",
			"option" => [
				"yes" => "Yes",
				"no" => "No"
			]
		]
	];
	
	// "newer"/"older" date pickers are shared by web, videos and news (tbs)
	$date_range = [
		"newer" => [
			"display" => "Newer than",
			"option" => "_DATE"
		],
		"older" => [
			"display" => "Older than",
			"option" => "_DATE"
		]
	];
	
	switch($page){
		
		case "web":
			$page_filters = [
				"lang" => [ // lr=<lang> (prefix lang with "lang_")
					"display" => "Language",
					"option" => [
						"any" => "Any language",
						"ar" => "Arabic",
						"bg" => "Bulgarian",
						"ca" => "Catalan",
						"cs" => "Czech",
						"da" => "Danish",
						"de" => "German",
						"el" => "Greek",
						"en" => "English",
						"es" => "Spanish",
						"et" => "Estonian",
						"fi" => "Finnish",
						"fr" => "French",
						"hr" => "Croatian",
						"hu" => "Hungarian",
						"id" => "Indonesian",
						"is" => "Icelandic",
						"it" => "Italian",
						"iw" => "Hebrew",
						"ja" => "Japanese",
						"ko" => "Korean",
						"lt" => "Lithuanian",
						"lv" => "Latvian",
						"nl" => "Dutch",
						"no" => "Norwegian",
						"pl" => "Polish",
						"pt" => "Portuguese",
						"ro" => "Romanian",
						"ru" => "Russian",
						"sk" => "Slovak",
						"sl" => "Slovenian",
						"sr" => "Serbian",
						"sv" => "Swedish",
						"tr" => "Turkish",
						"zh-CN" => "Chinese (Simplified)",
						"zh-TW" => "Chinese (Traditional)"
					]
				],
				"newer" => $date_range["newer"],
				"older" => $date_range["older"],
				"spellcheck" => [
					"display" => "Spellcheck",
					"option" => [
						"yes" => "Yes",
						"no" => "No"
					]
				]
			];
			break;
		
		case "images":
			$page_filters = [
				"time" => [ // tbs=qdr:<time>
					"display" => "Time posted",
					"option" => [
						"any" => "Any time",
						"d" => "Past 24 hours",
						"w" => "Past week",
						"m" => "Past month",
						"y" => "Past year"
					]
				],
				"size" => [ // imgsz
					"display" => "Size",
					"option" => [
						"any" => "Any size",
						"l" => "Large",
						"m" => "Medium",
						"i" => "Icon",
						"qsvga" => "Larger than 400x300",
						"vga" => "Larger than 640x480",
						"svga" => "Larger than 800x600",
						"xga" => "Larger than 1024x768",
						"2mp" => "Larger than 2MP",
						"4mp" => "Larger than 4MP",
						"6mp" => "Larger than 6MP",
						"8mp" => "Larger than 8MP",
						"10mp" => "Larger than 10MP",
						"12mp" => "Larger than 12MP",
						"15mp" => "Larger than 15MP",
						"20mp" => "Larger than 20MP",
						"40mp" => "Larger than 40MP",
						"70mp" => "Larger than 70MP"
					]
				],
				"ratio" => [ // imgar
					"display" => "Aspect ratio",
					"option" => [
						"any" => "Any ratio",
						"t|xt" => "Tall",
						"s" => "Square",
						"w" => "Wide",
						"xw" => "Panoramic"
					]
				],
				"color" => [ // imgc
					"display" => "Color",
					"option" => [
						"any" => "Any color",
						"color" => "Full color",
						"bnw" => "Black & white",
						"trans" => "Transparent",
						// from here, imgcolor
						"red" => "Red",
						"orange" => "Orange",
						"yellow" => "Yellow",
						"green" => "Green",
						"teal" => "Teal",
						"blue" => "Blue",
						"purple" => "Purple",
						"pink" => "Pink",
						"white" => "White",
						"gray" => "Gray",
						"black" => "Black",
						"brown" => "Brown"
					]
				],
				"type" => [ // tbs=itp:<type>
					"display" => "Type",
					"option" => [
						"any" => "Any type",
						"clipart" => "Clip Art",
						"lineart" => "Line Drawing",
						"animated" => "Animated"
					]
				],
				"format" => [ // as_filetype
					"display" => "Format",
					"option" => [
						"any" => "Any format",
						"jpg" => "JPG",
						"gif" => "GIF",
						"png" => "PNG",
						"bmp" => "BMP",
						"svg" => "SVG",
						"webp" => "WEBP",
						"ico" => "ICO",
						"craw" => "RAW"
					]
				],
				"rights" => [ // tbs=sur:<rights>
					"display" => "Usage rights",
					"option" => [
						"any" => "Any license",
						"cl" => "Creative Commons licenses",
						"ol" => "Commercial & other licenses"
					]
				]
			];
			break;
		
		case "videos":
			$page_filters = [
				"newer" => $date_range["newer"],
				"older" => $date_range["older"],
				"duration" => [
					"display" => "Duration",
					"option" => [
						"any" => "Any duration",
						"s" => "Short (0-4min)", // tbs=dur:s
						"m" => "Medium (4-20min)", // tbs=dur:m
						"l" => "Long (20+ min)" // tbs=dur:l
					]
				],
				"quality" => [
					"display" => "Quality",
					"option" => [
						"any" => "Any quality",
						"h" => "High quality" // tbs=hq:h
					]
				],
				"captions" => [
					"display" => "Captions",
					"option" => [
						"any" => "No preference",
						"yes" => "Closed captioned" // tbs=cc:1
					]
				]
			];
			break;
		
		case "news":
			$page_filters = [
				"newer" => $date_range["newer"],
				"older" => $date_range["older"],
				"sort" => [
					"display" => "Sort",
					"option" => [
						"relevance" => "Relevance",
						"date" => "Date" // sbd:1
					]
				]
			];
			break;
		
		default:
			// unknown page: nothing to describe
			return null;
	}
	
	return array_merge($shared, $page_filters);
}
2025-01-19 19:02:24 +00:00
/**
 * Perform an HTTP GET request through the given proxy and return the body.
 *
 * @param mixed  $proxy    Proxy descriptor, passed as-is to $this->backend->assign_proxy()
 * @param string $url      Target URL without a query string
 * @param array  $get      Query parameters appended to $url (nothing is appended when empty)
 * @param bool   $use_lynx When false, send a browser-like header set and request HTTP/2.
 *                         Otherwise send Lynx-like headers and convert the response
 *                         from ISO-8859-1 to UTF-8 before returning it.
 * @return string Response body
 * @throws Exception with the cURL error message when the transfer fails
 */
private function get($proxy, $url, $get = [], $use_lynx = false){
	
	$curlproc = curl_init();
	
	// try/finally so the handle is released on every path, including
	// the error path that throws
	try{
		
		if($use_lynx === false){
			
			$headers = [
				"User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"DNT: 1",
				//"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY",
				"Connection: keep-alive",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1",
				"Priority: u=1",
				"TE: trailers"
			];
			
			// use http2
			curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
		}else{
			
			$headers = [
				"Accept: text/html, text/plain, text/sgml, */*;q=0.01",
				"Accept-Encoding: gzip, compress, bzip2",
				"Accept-Language: en",
				"User-Agent: Lynx/2.9.0dev.12 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/3.7.8"
			];
		}
		
		if($get !== []){
			
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		// follow redirects
		curl_setopt($curlproc, CURLOPT_FOLLOWLOCATION, true);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			throw new Exception(curl_error($curlproc));
		}
	}finally{
		
		curl_close($curlproc);
	}
	
	if($use_lynx){
		
		return mb_convert_encoding($data, "UTF-8", "ISO-8859-1");
	}
	
	return $data;
}
2024-06-13 02:41:02 +00:00
/**
 * Parse a Google result page into the scraper's output structure.
 *
 * Loads the page into the HTML parser, indexes the page's CSS rules,
 * javascript-loaded images and HTML blobs, then walks every div.g inside
 * #center_col and extracts title, url, description, date, thumbnail and
 * extra table fields. When at least one web result was found, a
 * next-page token is stored through the backend.
 *
 * @param string $html     Raw page markup
 * @param string $pagetype Page type, passed as-is to $this->backend->store()
 * @param string $search   Search query (not read by this method)
 * @param mixed  $proxy    Proxy descriptor, passed as-is to $this->backend->store()
 * @param array  $params   Request parameters; "start" is advanced by 20 and
 *                         the result is JSON-encoded into the next-page token
 * @return array Result structure with the keys status, spelling, npt, answer,
 *               web, image, video, news and related
 * @throws Exception when the #center_col result container cannot be found
 */
private function parsepage($html, $pagetype, $search, $proxy, $params){
	
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	
	$this->fuckhtml->load($html);
	
	$this->detect_sorry();
	
	// parse all <style> tags
	$this->parsestyles();
	
	// get javascript images
	$this->scrape_dimg($html);
	
	// get html blobs
	preg_match_all(
		'/function\(\){window\.jsl\.dh\(\'([^\']+?)\',\'(.+?[^\'])\'\);/',
		$html,
		$blobs
	);
	
	$this->blobs = [];
	if(isset($blobs[1])){
		
		for($i=0; $i<count($blobs[1]); $i++){
			
			$this->blobs[$blobs[1][$i]] =
				$this->fuckhtml
				->parseJsString(
					$blobs[2][$i]
				);
		}
	}
	
	$this->scrape_imagearr($html);
	
	//
	// load result column
	//
	$result_div =
		$this->fuckhtml
		->getElementById(
			"center_col",
			"div"
		);
	
	if($result_div === false){
		
		throw new Exception("Failed to grep result div");
	}
	
	$this->fuckhtml->load($result_div);
	
	// important for later
	// NOTE(review): nothing in this method sets it to true
	$last_page = false;
	
	//
	// Get text results
	//
	$results =
		$this->fuckhtml
		->getElementsByClassName(
			"g",
			"div"
		);
	
	$this->skip_next = false;
	
	foreach($results as $result){
		
		if($this->skip_next){
			
			$this->skip_next = false;
			continue;
		}
		
		$this->fuckhtml->load($result);
		
		$web = [
			"title" => null,
			"description" => null,
			"url" => null,
			"date" => null,
			"type" => "web",
			"thumb" => [
				"url" => null,
				"ratio" => null
			],
			"sublink" => [],
			"table" => []
		];
		
		// Detect presence of sublinks
		$g =
			$this->fuckhtml
			->getElementsByClassName(
				"g",
				"div"
			);
		
		if(count($g) > 0){
			
			// a nested div.g also appears in $results right after this
			// one: skip it on the next iteration
			$this->skip_next = true;
		}
		
		// get title
		$h3 =
			$this->fuckhtml
			->getElementsByTagName(
				"h3"
			);
		
		if(count($h3) === 0){
			
			continue;
		}
		
		$web["title"] =
			$this->titledots(
				$this->fuckhtml
				->getTextContent(
					$h3[0]
				)
			);
		
		// get url
		$as =
			$this->fuckhtml
			->getElementsByTagName(
				"a"
			);
		
		if(
			count($as) === 0 ||
			!isset($as[0]["attributes"]["href"])
		){
			
			// no usable link in this container
			continue;
		}
		
		$web["url"] =
			$this->unshiturl(
				$as[0]
				["attributes"]
				["href"]
			);
		
		if(
			!preg_match(
				'/^http/',
				$web["url"]
			)
		){
			
			// skip if invalid url is found
			continue;
		}
		
		//
		// get viewcount, time posted and follower count from <cite> tag
		//
		$cite =
			$this->fuckhtml
			->getElementsByTagName(
				"cite"
			);
		
		if(count($cite) !== 0){
			
			$this->fuckhtml->load($cite[0]);
			
			$spans =
				$this->fuckhtml
				->getElementsByTagName("span");
			
			if(count($spans) === 0){
				
				$cites =
					explode(
						"·",
						$this->fuckhtml
						->getTextContent(
							$cite[0]
						)
					);
				
				foreach($cites as $cite_part){
					
					$cite_part = trim($cite_part);
					
					if(
						preg_match(
							'/(.+) (views|followers|likes)$/',
							$cite_part,
							$match
						)
					){
						
						$web["table"][ucfirst($match[2])] =
							$match[1];
					}elseif(
						preg_match(
							'/ago$/',
							$cite_part
						)
					){
						
						$web["date"] =
							strtotime($cite_part);
					}
				}
			}
			
			// reset
			$this->fuckhtml->load($result);
		}
		
		//
		// attempt to fetch description cleanly
		//
		$description =
			$this->fuckhtml
			->getElementsByAttributeValue(
				"style",
				"-webkit-line-clamp:2"
			);
		
		if(count($description) !== 0){
			
			$web["description"] =
				$this->titledots(
					$this->fuckhtml
					->getTextContent(
						$description[0]
					)
				);
		}else{
			
			// second method: the description is a header of the result
			$description =
				$this->fuckhtml
				->getElementsByAttributeValue(
					"data-attrid",
					"wa:/description"
				);
			
			if(count($description) !== 0){
				
				// the date lives in a span inside the description: cut it
				// out of the description and keep it as the result date
				$date =
					$this->fuckhtml
					->getElementsByClassName(
						$this->getstyle(
							[
								"font-size" => "12px",
								"line-height" => "1.34",
								"display" => "inline-block",
								"font-family" => "google sans,arial,sans-serif",
								"padding-right" => "0",
								"white-space" => "nowrap"
							]
						),
						"span"
					);
				
				if(count($date) !== 0){
					
					$description[0]["innerHTML"] =
						str_replace(
							$date[0]["outerHTML"],
							"",
							$description[0]["innerHTML"]
						);
					
					$web["date"] =
						strtotime(
							$this->fuckhtml
							->getTextContent(
								$date[0]
							)
						);
				}
				
				$web["description"] =
					$this->fuckhtml
					->getTextContent(
						$description[0]
					);
			}else{
				
				// third method: descriptions of youtube containers
				$description =
					$this->fuckhtml
					->getElementsByClassName(
						$this->getstyle(
							[
								"-webkit-box-orient" => "vertical",
								"display" => "-webkit-box",
								"font-size" => "14px",
								"-webkit-line-clamp" => "2",
								"line-height" => "22px",
								"overflow" => "hidden",
								"word-break" => "break-word",
								"color" => "#4d5156"
							]
						),
						"div"
					);
				
				if(count($description) !== 0){
					
					// check for video duration
					$duration =
						$this->fuckhtml
						->getElementsByClassName(
							$this->getstyle(
								[
									"background-color" => "rgba(0,0,0,0.6)",
									"color" => "#fff",
									"fill" => "#fff"
								]
							),
							"div"
						);
					
					if(count($duration) !== 0){
						
						$web["table"]["Duration"] =
							$this->fuckhtml
							->getTextContent(
								$duration[0]
							);
					}
					
					$web["description"] =
						$this->titledots(
							html_entity_decode(
								$this->fuckhtml
								->getTextContent(
									$description[0]
								)
							)
						);
					
					// get author + time posted
					$info =
						$this->fuckhtml
						->getElementsByClassName(
							$this->getstyle(
								[
									"color" => "var(" . $this->getcolorvar("#70757a") . ")",
									"font-size" => "14px",
									"line-height" => "20px",
									"margin-top" => "12px"
								]
							),
							"div"
						);
					
					if(count($info) !== 0){
						
						$info =
							explode(
								"·",
								$this->fuckhtml
								->getTextContent(
									$info[0]
								)
							);
						
						switch(count($info)){
							
							case 3:
								$web["table"]["Author"] = trim($info[1]);
								$web["date"] = strtotime(trim($info[2]));
								break;
							
							case 2:
								$web["date"] = strtotime(trim($info[1]));
								break;
						}
					}
				}
			}
		}
		
		//
		// get categories of content within the search result
		//
		$cats =
			$this->fuckhtml
			->getElementsByAttributeName(
				"data-sncf",
				"div"
			);
		
		foreach($cats as $cat){
			
			$this->fuckhtml->load($cat);
			
			// detect image category
			$images =
				$this->fuckhtml
				->getElementsByTagName(
					"img"
				);
			
			if(count($images) !== 0){
				
				foreach($images as $image){
					
					if(isset($image["attributes"]["id"])){
						
						// we found an image
						if(isset($image["attributes"]["width"])){
							
							// ratio is guessed from the width attribute alone
							$width = (int)$image["attributes"]["width"];
							
							if($width == 110){
								
								$ratio = "1:1";
							}elseif($width > 110){
								
								$ratio = "16:9";
							}else{
								
								$ratio = "9:16";
							}
						}else{
							
							$ratio = "1:1";
						}
						
						$web["thumb"] = [
							"url" => $this->getdimg($image["attributes"]["id"]),
							"ratio" => $ratio
						];
						
						// next category
						continue 2;
					}
				}
			}
			
			// Detect rating
			$spans_unfiltered =
				$this->fuckhtml
				->getElementsByTagName(
					"span"
				);
			
			$spans =
				$this->fuckhtml
				->getElementsByAttributeName(
					"aria-label",
					$spans_unfiltered
				);
			
			foreach($spans as $span){
				
				if(
					preg_match(
						'/^Rated/',
						$span["attributes"]["aria-label"]
					)
				){
					
					// found rating
					// scrape rating
					preg_match(
						'/([0-9.]+).*([0-9.]+)/',
						$span["attributes"]["aria-label"],
						$rating
					);
					
					if(isset($rating[1])){
						
						$web["table"]["Rating"] =
							$rating[1] . "/" . $rating[2];
					}
					
					// counts the fields seen so far: the spans following
					// the vote count are read as price, platform, medium
					$has_seen_reviews = 0;
					
					foreach($spans_unfiltered as $span_unfiltered){
						
						if(
							preg_match(
								'/([0-9,.]+) +([A-z]+)$/',
								$this->fuckhtml
								->getTextContent(
									$span_unfiltered
								),
								$votes
							)
						){
							
							$has_seen_reviews++;
							$web["table"][ucfirst($votes[2])] = $votes[1];
							continue;
						}
						
						$text =
							$this->fuckhtml
							->getTextContent(
								$span_unfiltered
							);
						
						if(
							$text == "&nbsp;&nbsp;&nbsp;" ||
							$text == ""
						){
							
							break;
						}
						
						switch($has_seen_reviews){
							
							case 1:
								// scrape price
								$web["table"]["Price"] = $text;
								$has_seen_reviews++;
								break;
							
							case 2:
								// scrape platform
								$web["table"]["Platform"] = $text;
								$has_seen_reviews++;
								break;
							
							case 3:
								// Scrape type
								$web["table"]["Medium"] = $text;
								break;
						}
					}
					
					// next category
					continue 2;
				}
			}
			
			// check if its an answer header
			$answer_header =
				$this->fuckhtml
				->getElementsByClassName(
					$this->getstyle(
						[
							"overflow" => "hidden",
							"text-overflow" => "ellipsis"
						]
					),
					"span"
				);
			
			if(count($answer_header) !== 0){
				
				// answer headers are ignored. The previous code edited
				// the loop's private copy of $cat here, which had no
				// effect and indexed the first <a> without checking
				// that one exists.
				continue;
			}
			
			// we probed everything, assume this is the description
			// if we didn't find one cleanly previously
			if($web["description"] === null){
				
				$web["description"] =
					$this->titledots(
						$this->fuckhtml
						->getTextContent(
							$cat
						)
					);
			}
		}
		
		// check if description contains date ("<date> — <text>").
		// The separator is written as an escape so the em dash cannot be
		// lost to an encoding mishap; an empty separator makes explode()
		// throw ValueError on PHP 8.
		if($web["description"] !== null){
			
			$description = explode("\u{2014}", $web["description"], 2);
			
			if(
				count($description) === 2 &&
				strlen($description[0]) <= 20
			){
				
				$date = strtotime($description[0]);
				
				if($date !== false){
					
					$web["date"] = $date;
					$web["description"] = ltrim($description[1]);
				}
			}
		}
		
		// fetch youtube thumbnail
		$thumbnail =
			$this->fuckhtml
			->getElementsByClassName(
				$this->getstyle(
					[
						"border-radius" => "8px",
						"height" => "fit-content",
						"justify-content" => "center",
						"margin-right" => "20px",
						"margin-top" => "4px",
						"position" => "relative",
						"width" => "fit-content"
					]
				),
				"div"
			);
		
		if(count($thumbnail) !== 0){
			
			// load thumbnail container
			$this->fuckhtml->load($thumbnail[0]);
			
			$image =
				$this->fuckhtml
				->getElementsByTagName(
					"img"
				);
			
			if(
				count($image) !== 0 &&
				isset($image[0]["attributes"]["id"])
			){
				
				$web["thumb"] = [
					"url" =>
						$this->unshit_thumb(
							$this->getdimg(
								$image[0]["attributes"]["id"]
							)
						),
					"ratio" => "16:9"
				];
			}
			
			// reset
			$this->fuckhtml->load($result);
		}
		
		$out["web"][] = $web;
	}
	
	// reset
	$this->fuckhtml->load($result_div);
	
	//
	// craft $npt token
	//
	if(
		$last_page === false &&
		count($out["web"]) !== 0
	){
		
		if(!isset($params["start"])){
			
			$params["start"] = 20;
		}else{
			
			$params["start"] += 20;
		}
		
		$out["npt"] =
			$this->backend
			->store(
				json_encode($params),
				$pagetype,
				$proxy
			);
	}
	
	return $out;
}
/**
 * Index the images the page loads through javascript into $this->dimg,
 * keyed by the id of the <img> element they belong to.
 *
 * Two sources are read: the google.ldi={...} objects (id => url, urls are
 * passed through unshit_thumb()) and the "var s='data:image/...';var
 * ii=[...]" snippets (one data URI shared by every id listed in ii).
 *
 * @param string $html Raw page markup
 * @return void
 */
private function scrape_dimg($html){
	
	// get images loaded through javascript
	$this->dimg = [];
	
	preg_match_all(
		'/function\(\){google\.ldi=({.*?});/',
		$html,
		$ldi
	);
	
	if(isset($ldi[1])){
		
		foreach($ldi[1] as $json){
			
			$map = json_decode($json, true);
			
			if(!is_array($map)){
				
				// not valid JSON (or not an object): nothing to index
				continue;
			}
			
			foreach($map as $key => $value){
				
				if(!is_string($value)){
					
					continue;
				}
				
				$this->dimg[$key] =
					$this->unshit_thumb(
						$value
					);
			}
		}
	}
	
	// get additional javascript base64 images
	preg_match_all(
		'/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/',
		$html,
		$inline
	);
	
	if(isset($inline[1])){
		
		for($i=0; $i<count($inline[1]); $i++){
			
			$ids = explode(",", $inline[2][$i]);
			$data_uri =
				$this->fuckhtml
				->parseJsString(
					$inline[1][$i]
				);
			
			foreach($ids as $id){
				
				// ids are captured with their surrounding quotes
				$this->dimg[trim($id, "'")] = $data_uri;
			}
		}
	}
}
/**
 * Index the image link arrays embedded in the page into $this->image_arr.
 *
 * Every match of the form [0,"<id>",["<url>",n,n],["<url>",n,n] becomes
 * one entry keyed by <id> holding two images: index 0 is built from the
 * second url/size triple, index 1 from the first one (whose url is passed
 * through unshit_thumb()). In each triple the first number is stored as
 * "height" and the second as "width".
 *
 * @param string $html Raw page markup
 * @return void
 */
private function scrape_imagearr($html){
	
	$pattern = '/\[0,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/';
	
	preg_match_all($pattern, $html, $found);
	
	$this->image_arr = [];
	
	if(!isset($found[1])){
		
		return;
	}
	
	foreach($found[1] as $index => $id){
		
		$second_triple = [
			"url" => $this->fuckhtml->parseJsString($found[5][$index]),
			"width" => (int)$found[7][$index],
			"height" => (int)$found[6][$index]
		];
		
		$first_triple = [
			"url" =>
				$this->unshit_thumb(
					$this->fuckhtml->parseJsString($found[2][$index])
				),
			"width" => (int)$found[4][$index],
			"height" => (int)$found[3][$index]
		];
		
		$this->image_arr[$id] = [$second_triple, $first_triple];
	}
}
/**
 * Look up an image collected by scrape_dimg().
 *
 * @param string $dimg Key into $this->dimg
 * @return mixed|null The stored value, or null when the key is unknown
 */
private function getdimg($dimg){
	
	return $this->dimg[$dimg] ?? null;
}
/**
 * Reduce a gstatic thumbnail URL to its "q" parameter only.
 *
 * URLs whose host matches tbn*.gstatic.com are rebuilt as
 * https://<host>/images?q=<q>; every other URL (and any gstatic URL
 * without a usable "q" parameter) is returned unchanged.
 *
 * Examples of inputs:
 * https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj
 * https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA
 *
 * @param string $url Thumbnail URL
 * @return string Cleaned URL, or $url itself when nothing applies
 */
private function unshit_thumb($url){
	
	$parts = parse_url($url);
	
	if(
		// a matching host without query string used to raise an
		// undefined index notice on $parts["query"]
		!isset($parts["host"], $parts["query"]) ||
		!preg_match(
			'/tbn.*\.gstatic\.com/',
			$parts["host"]
		)
	){
		
		return $url;
	}
	
	parse_str($parts["query"], $params);
	
	if(
		isset($params["q"]) &&
		is_string($params["q"]) // q[]=... would parse to an array
	){
		
		return "https://" . $parts["host"] . "/images?q=" . $params["q"];
	}
	
	return $url;
}
/**
 * Parse every <style> tag of the currently loaded document.
 *
 * Fills $this->styles with selector => [property => lowercased value],
 * plus a "_c" entry holding the number of properties of that selector,
 * and $this->css_colors with value => lowercased property name for every
 * entry of the ":root" selector.
 *
 * @return void
 */
private function parsestyles(){
	
	// concatenate the content of all <style> tags
	$css = "";
	
	foreach($this->fuckhtml->getElementsByTagName("style") as $style_tag){
		
		$css .= $style_tag["innerHTML"];
	}
	
	// filter out media/keyframe queries
	$css =
		preg_replace(
			'/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/',
			"",
			$css
		);
	
	// split into "selectors{declarations}" rules
	preg_match_all(
		'/(.+?){([\S\s]*?)}/',
		$css,
		$rules
	);
	
	$parsed = [];
	
	foreach($rules[1] as $rule_index => $selector_list){
		
		// split the declarations into property/value pairs
		preg_match_all(
			'/([^:;]+):([^;]*?(?:\([^)]+\)[^;]*?)?)(?:;|$)/',
			$rules[2][$rule_index],
			$declarations
		);
		
		$properties = [];
		
		foreach($declarations[1] as $decl_index => $property){
			
			$properties[trim($property)] =
				strtolower(trim($declarations[2][$decl_index]));
		}
		
		// h1,h2,h3 will each get their own array index
		foreach(explode(",", $selector_list) as $selector){
			
			$selector = trim($selector, "}\t\n\r\0\x0B");
			
			foreach($properties as $property => $value){
				
				$parsed[$selector][$property] = $value;
			}
		}
	}
	
	// remember how many properties each selector carries
	foreach($parsed as $selector => $properties){
		
		$parsed[$selector]["_c"] = count($properties);
	}
	
	$this->styles = $parsed;
	
	// get CSS colors
	$this->css_colors = [];
	
	if(!isset($this->styles[":root"])){
		
		return;
	}
	
	foreach($this->styles[":root"] as $property => $value){
		
		$this->css_colors[$value] = strtolower($property);
	}
}
/**
 * Find the class name whose CSS rule consists of exactly the given
 * property/value pairs.
 *
 * Walks $this->styles in order and picks the first selector whose
 * properties (including the "_c" property count) all match. Of that
 * selector only the part after the last space is kept, with "." and "#"
 * replaced by spaces and leading whitespace removed.
 *
 * @param array $styles property => value pairs to look for
 * @return string|false Class name(s), or false when no rule matches
 */
private function getstyle($styles){
	
	$wanted = $styles;
	$wanted["_c"] = count($styles);
	
	// every requested property plus the "_c" entry itself must match
	$required_matches = $wanted["_c"] + 1;
	
	foreach($this->styles as $selector => $properties){
		
		$matching = array_intersect_assoc($properties, $wanted);
		
		if(count($matching) !== $required_matches){
			
			continue;
		}
		
		// keep the last compound of a descendant selector
		$segments = explode(" ", $selector);
		$last_segment = $segments[count($segments) - 1];
		
		return ltrim(str_replace([".", "#"], " ", $last_segment));
	}
	
	return false;
}
/**
 * Look up the entry of $this->css_colors stored for a color value.
 *
 * @param string $color Color value used as key, e.g. "#70757a"
 * @return string|null The stored name, or null when the color is unknown
 */
private function getcolorvar($color){
	
	return $this->css_colors[$color] ?? null;
}
/**
 * Scrape a Google web search page.
 *
 * With $get["npt"] set, the stored next-page payload (the href of the
 * "Next" link, relative to https://www.google.com) is fetched through the
 * proxy it was stored with. Otherwise the request is built from
 * $get["s"], "country", "nsfw", "lang", "older", "newer" and "spellcheck".
 *
 * Result boxes are located through their computed CSS declarations
 * (see getstyle()) and sorted into spelling info, image carousel,
 * related searches, web results, news and answer boxes.
 *
 * @param array $get Search parameters
 * @return array Keys: status, spelling, npt, answer, web, image, video,
 *               news, related
 * @throws Exception When the page cannot be fetched
 */
public function web($get){
	
	if($get["npt"]){
		
		[$get, $proxy] = $this->backend->get($get["npt"], "web");
		
		try{
			$html =
				$this->get(
					$proxy,
					"https://www.google.com" . $get,
					[],
					true
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get HTML");
		}
	}else{
		
		$search = $get["s"];
		$country = $get["country"];
		$nsfw = $get["nsfw"];
		$lang = $get["lang"];
		$older = $get["older"];
		$newer = $get["newer"];
		$spellcheck = $get["spellcheck"];
		$proxy = $this->backend->get_ip();
		
		$params = [
			"q" => $search,
			"hl" => "en",
			"num" => 20
		];
		
		// country
		if($country != "any"){
			
			$params["gl"] = $country;
		}
		
		// nsfw
		$params["safe"] = $nsfw == "yes" ? "off" : "active";
		
		// language
		if($lang != "any"){
			
			$params["lr"] = "lang_" . $lang;
		}
		
		// generate tbs
		$tbs = [];
		
		// get date
		$older = $older === false ? null : date("m/d/Y", $older);
		$newer = $newer === false ? null : date("m/d/Y", $newer);
		
		if(
			$older !== null ||
			$newer !== null
		){
			
			$tbs["cdr"] = "1";
			$tbs["cd_min"] = $newer;
			$tbs["cd_max"] = $older;
		}
		
		// spellcheck filter
		if($spellcheck == "no"){
			
			$params["nfpr"] = "1";
		}
		
		if(count($tbs) !== 0){
			
			$params["tbs"] = "";
			
			foreach($tbs as $key => $value){
				
				$params["tbs"] .= $key . ":" . $value . ",";
			}
			
			$params["tbs"] = rtrim($params["tbs"], ",");
		}
		
		try{
			$html =
				$this->get(
					$proxy,
					"https://www.google.com/search",
					$params,
					true
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get HTML");
		}
	}
	
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	
	$this->fuckhtml->load($html);
	$this->parsestyles();
	
	// every result / widget sits in a bordered box
	$boxes =
		$this->fuckhtml
		->getElementsByClassName(
			$this->getstyle([
				"border" => "thin solid #dadce0",
				"padding" => "12px 16px 12px 16px",
				"margin-bottom" => "10px",
				"font-family" => "sans-serif"
			]),
			"div"
		);
	
	// get next page token
	$npt =
		$this->fuckhtml
		->getElementsByClassName(
			$this->getstyle([
				"border" => "thin solid #dadce0",
				"color" => "#70757a",
				"font-size" => "14px",
				"text-align" => "center",
				"table-layout" => "fixed",
				"width" => "100%"
			]),
			"table"
		);
	
	if(count($npt) !== 0){
		
		$this->fuckhtml->load($npt[0]);
		
		$as = $this->fuckhtml->getElementsByTagName("a");
		
		foreach($as as $a){
			
			$text = $this->fuckhtml->getTextContent($a);
			
			if(
				$text == "Next&nbsp;>" ||
				$text == ">"
			){
				
				$out["npt"] =
					$this->backend->store(
						$this->fuckhtml
						->getTextContent(
							$a["attributes"]["href"]
						),
						"web",
						$proxy
					);
			}
		}
		
		// go back to the full document
		$this->fuckhtml->load($html);
	}
	
	$first_box = true;
	
	foreach($boxes as $box){
		
		$this->fuckhtml->load($box);
		
		if($first_box){
			
			//
			// Probe for word correction
			//
			$first_box = false;
			
			$txt = $this->fuckhtml->getTextContent($box);
			
			if(
				preg_match(
					'/^Showing results for /',
					$txt
				)
			){
				
				$as = $this->fuckhtml->getElementsByTagName("a");
				
				if(count($as) === 2){
					
					$out["spelling"] = [
						"type" => "including",
						"using" => $this->fuckhtml->getTextContent($as[0]),
						"correction" => $this->fuckhtml->getTextContent($as[1])
					];
				}
				
				continue;
			}
		}
		
		// probe for custom container
		$container_title =
			$this->fuckhtml
			->getElementsByClassName(
				$this->getstyle([
					"font-weight" => "bold"
				])
			);
		
		if(count($container_title) !== 0){
			
			$container_title =
				strtolower(
					$this->fuckhtml
					->getTextContent(
						$container_title[0]
					)
				);
			
			if($container_title == "images"){
				
				//
				// Parse image carousel
				//
				$images =
					$this->fuckhtml
					->getElementsByClassName(
						$this->getstyle([
							"display" => "inline-block",
							"padding" => "2px",
							"padding-bottom" => "4px"
						]),
						"a"
					);
				
				foreach($images as $image){
					
					$this->fuckhtml->load($image);
					
					$image_data =
						$this->unshiturl(
							$image["attributes"]["href"],
							true
						);
					
					$imgs = $this->fuckhtml->getElementsByTagName("img");
					
					if(count($imgs) === 0){
						
						// link without a thumbnail, nothing to show
						continue;
					}
					
					$img = $imgs[0];
					
					$out["image"][] = [
						"title" =>
							$this->titledots(
								$this->fuckhtml
								->getTextContent(
									$img["attributes"]["alt"]
								)
							),
						"source" => [
							[
								"url" => $image_data["url"],
								"width" => $image_data["image_width"],
								"height" => $image_data["image_height"]
							],
							[
								"url" =>
									$this->fuckhtml
									->getTextContent(
										$img["attributes"]["src"]
									),
								"width" => $image_data["thumb_width"],
								"height" => $image_data["thumb_height"]
							]
						],
						"url" => $image_data["ref"]
					];
				}
				
				continue;
			}
			
			if(
				$container_title == "related searches" ||
				$container_title == "people also search for"
			){
				
				$as =
					$this->fuckhtml
					->getElementsByClassName(
						$this->getstyle([
							"color" => "#202124",
							"font-size" => "13px",
							"line-height" => "20px"
						]),
						"span"
					);
				
				foreach($as as $a){
					
					$out["related"][] = $this->fuckhtml->getTextContent($a);
				}
				
				continue;
			}
		}
		
		// probe for website link
		$link =
			$this->fuckhtml
			->getElementsByClassName(
				$this->getstyle([
					"color" => "#1967d2",
					"font-size" => "18px",
					"line-height" => "24px"
				]),
				"a"
			);
		
		if(count($link) !== 0){
			
			//
			// Parse search result
			//
			$this->fuckhtml->load($link[0]);
			
			$title =
				$this->fuckhtml
				->getElementsByClassName(
					$this->getstyle([
						"color" => "#1967d2",
						"font-size" => "18px",
						"line-height" => "24px"
					]),
					"span"
				);
			
			if(count($title) === 0){
				
				continue;
			}
			
			$this->fuckhtml->load($box);
			
			$sublinks = [];
			$table = [];
			
			$categories =
				$this->fuckhtml
				->getElementsByClassName(
					$this->getstyle([
						"color" => "#202124",
						"font-size" => "13px",
						"line-height" => "20px"
					]),
					"span"
				);
			
			foreach($categories as $index => $category){
				
				$this->fuckhtml->load($category);
				
				// probe for sublinks
				$subs =
					$this->fuckhtml
					->getElementsByClassName(
						$this->getstyle([
							"color" => "#1967d2"
						]),
						"a"
					);
				
				if(count($subs) === 0){
					
					continue;
				}
				
				foreach($subs as $sub){
					
					$url =
						$this->unshiturl(
							$this->fuckhtml
							->getTextContent(
								$sub["attributes"]["href"]
							)
						);
					
					if(
						preg_match(
							'/^https?:\/\//',
							$url
						)
					){
						
						$sublinks[] = [
							"title" =>
								$this->titledots(
									$this->fuckhtml
									->getTextContent(
										$sub
									)
								),
							"description" => null,
							"url" => $url,
							"date" => null
						];
					}
				}
				
				// span only held sublinks, don't treat it as text later
				unset($categories[$index]);
			}
			
			// get description & date
			$date = null;
			$description = null;
			$categories = array_values($categories);
			
			if(count($categories) !== 0){
				
				// last remaining span is the description,
				// pop it since we're done with it afterwards
				$description =
					$this->fuckhtml
					->getTextContent(
						array_pop($categories)
					);
				
				// probe for date ("<date> · <description>")
				$description_tmp = explode("·", $description, 2);
				$date_tmp = strtotime(trim($description_tmp[0]));
				
				if(
					count($description_tmp) === 2 &&
					strlen($description_tmp[0]) <= 20 &&
					$date_tmp !== false
				){
					
					$description =
						ltrim(
							$this->titledots(
								$description_tmp[1]
							)
						);
					$date = $date_tmp;
				}else{
					
					$description = $this->titledots($description);
				}
			}
			
			// remaining categories should all be greytext
			if(count($categories) !== 0){
				
				$texts =
					explode(
						"·",
						preg_replace(
							'/\s+/',
							" ",
							$this->fuckhtml
							->getTextContent(
								$categories[0]
							)
						)
					);
				
				foreach($texts as $text){
					
					$text = trim($text);
					
					if(
						preg_match(
							'/^Rating ([0-9.]+)(?: \(([0-9,]+)\))?/',
							$text,
							$rating
						)
					){
						
						$table["Rating"] = $rating[1];
						
						if(isset($rating[2])){
							
							$table["Rating"] .= " (" . $rating[2] . " votes)";
						}
						continue;
					}
					
					if(stripos($text, "stock") !== false){
						
						$table["Stock"] = $text;
						continue;
					}
				}
			}
			
			$out["web"][] = [
				"title" =>
					$this->titledots(
						$this->fuckhtml
						->getTextContent(
							$title[0]
						)
					),
				"description" => $description,
				"url" =>
					$this->unshiturl(
						$link[0]["attributes"]["href"]
					),
				"date" => $date,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => $sublinks,
				"table" => $table
			];
			
			continue;
		}
		
		// parse wikipedia heads
		$wiki_title =
			$this->fuckhtml
			->getElementsByClassName(
				$this->getstyle([
					"color" => "#202124",
					"font-size" => "18px",
					"line-height" => "24px"
				]),
				"span"
			);
		
		if(count($wiki_title) !== 0){
			
			$wiki_title =
				$this->fuckhtml
				->getTextContent(
					$wiki_title[0]
				);
			
			if($wiki_title == "See results about"){
				
				// ignore
				continue;
			}
			
			if($wiki_title == "Top stories"){
				
				//
				// Parse news
				//
				$tds = $this->fuckhtml->getElementsByTagName("td");
				
				foreach($tds as $td){
					
					$this->fuckhtml->load($td);
					
					$a = $this->fuckhtml->getElementsByTagName("a");
					
					if(count($a) === 0){
						
						continue;
					}
					
					$title =
						$this->fuckhtml
						->getElementsByClassName(
							$this->getstyle([
								"color" => "#1967d2"
							]),
							"span"
						);
					
					if(count($title) === 0){
						
						continue;
					}
					
					$date = null;
					
					$meta_div =
						$this->fuckhtml
						->getElementsByClassName(
							$this->getstyle([
								"color" => "#70757a",
								"font-size" => "13px",
								"line-height" => "20px"
							]),
							"span"
						);
					
					if(count($meta_div) !== 0){
						
						// last grey span reads "<source> · <date>"
						$meta_div =
							explode(
								"·",
								$this->fuckhtml
								->getTextContent(
									$meta_div[count($meta_div) - 1]
								),
								2
							);
						
						if(count($meta_div) === 2){
							
							$date = strtotime($meta_div[1]);
							
							if($date === false){
								
								$date = null;
							}
						}
					}
					
					$out["news"][] = [
						"title" =>
							$this->titledots(
								$this->fuckhtml
								->getTextContent(
									$title[0]
								)
							),
						"description" => null,
						"date" => $date,
						"thumb" => [
							"url" => null,
							"ratio" => null
						],
						"url" =>
							$this->unshiturl(
								$a[0]["attributes"]["href"]
							)
					];
				}
				
				continue;
			}
			
			//
			// Parse wikipedia heads
			//
			$table_div = $this->fuckhtml->getElementsByTagName("table");
			
			if(count($table_div) === 0){
				
				continue;
			}
			
			$this->fuckhtml->load($table_div[0]);
			
			// remove table from box
			$box["innerHTML"] =
				str_replace(
					$table_div[0]["outerHTML"],
					"",
					$box["innerHTML"]
				);
			
			// find wiki image
			$thumb = null;
			
			$img = $this->fuckhtml->getElementsByTagName("img");
			
			if(count($img) !== 0){
				
				$thumb =
					$this->fuckhtml
					->getTextContent(
						$img[0]["attributes"]["src"]
					);
			}
			
			$tds = $this->fuckhtml->getElementsByTagName("td");
			
			$description = [];
			
			foreach($tds as $td){
				
				// probe for subtitle
				$this->fuckhtml->load($td);
				
				$subtext =
					$this->fuckhtml
					->getElementsByClassName(
						$this->getstyle([
							"color" => "#70757a",
							"font-size" => "13px",
							"line-height" => "20px"
						])
					);
				
				if(count($subtext) !== 0){
					
					$description[] = [
						"type" => "quote",
						"value" =>
							$this->fuckhtml
							->getTextContent(
								$subtext[0]
							)
					];
					break;
				}
			}
			
			$this->fuckhtml->load($box);
			
			// probe for word definition
			$lists = $this->fuckhtml->getElementsByTagName("ol");
			
			if(count($lists) !== 0){
				
				$description = [];
				
				// html of the box that has not been consumed yet
				$remaining_html = $box["innerHTML"];
				
				foreach($lists as $list){
					
					// text in front of the list is its heading
					$split =
						explode(
							$list["outerHTML"],
							$remaining_html,
							2
						);
					
					if(
						count($split) === 1 ||
						trim($split[0]) == ""
					){
						
						break;
					}
					
					$description[] = [
						"type" => "title",
						"value" =>
							$this->fuckhtml
							->getTextContent(
								$split[0]
							)
					];
					
					$this->fuckhtml->load($list);
					
					$lis = $this->fuckhtml->getElementsByTagName("li");
					
					$increment = 1;
					
					foreach($lis as $li){
						
						$this->fuckhtml->load($li);
						
						$list_items =
							$this->fuckhtml
							->getElementsByClassName(
								$this->getstyle([
									"color" => "#202124",
									"font-size" => "13px",
									"line-height" => "20px"
								])
							);
						
						$first_item = true;
						
						foreach($list_items as $it){
							
							if($first_item){
								
								// first span is the numbered definition
								$first_item = false;
								
								$c = count($description);
								
								if(
									$c !== 0 &&
									$description[$c - 1]["type"] == "text"
								){
									
									$description[$c - 1]["value"] .=
										"\n\n" .
										$increment . ". " . $this->fuckhtml
										->getTextContent(
											$it
										);
								}else{
									
									$description[] = [
										"type" => "text",
										"value" =>
											$increment . ". " . $this->fuckhtml
											->getTextContent(
												$it
											)
									];
								}
							}else{
								
								// following spans are usage examples
								$description[] = [
									"type" => "quote",
									"value" =>
										$this->fuckhtml
										->getTextContent(
											$it
										)
								];
							}
						}
						
						// number definitions per list entry: counting per
						// span skipped numbers whenever an entry had a quote
						$increment++;
					}
					
					$remaining_html = $split[1];
				}
				
				$out["answer"][] = [
					"title" => $wiki_title,
					"description" => $description,
					"url" => null,
					"thumb" => null,
					"table" => [],
					"sublink" => []
				];
				
				continue;
			}
			
			// get separator between description and facts
			$separator =
				$this->fuckhtml
				->getElementsByClassName(
					$this->getstyle([
						"height" => "4px"
					]),
					"div"
				);
			
			$table = [];
			
			if(count($separator) !== 0){
				
				$box_html =
					explode(
						$separator[0]["outerHTML"],
						$box["innerHTML"],
						2
					);
				
				// only parse facts when the separator was actually found
				// inside the (already modified) box html
				if(count($box_html) === 2){
					
					$box["innerHTML"] = $box_html[0];
					
					$this->fuckhtml->load($box_html[1]);
					
					// get all facts
					$facts = $this->fuckhtml->getElementsByTagName("div");
					
					foreach($facts as $fact){
						
						if($fact["level"] !== 1){ continue; }
						
						// "<name>: <value>", value may itself contain colons
						$fact =
							explode(
								":",
								$this->fuckhtml
								->getTextContent(
									$fact
								),
								2
							);
						
						if(count($fact) !== 2){ continue; }
						
						$table[trim(preg_replace('/\s+/', " ", $fact[0]))] =
							trim(preg_replace('/\s+/', " ", $fact[1]));
					}
				}
				
				$this->fuckhtml->load($box);
			}
			
			// remove wikipedia link
			$wiki_link =
				$this->fuckhtml
				->getElementsByClassName(
					$this->getstyle([
						"color" => "#1967d2"
					]),
					"a"
				);
			
			$url = null;
			
			foreach($wiki_link as $link){
				
				if(
					strtolower(
						$this->fuckhtml
						->getTextContent(
							$link
						)
					) == "wikipedia"
				){
					
					$box["innerHTML"] =
						str_replace(
							$link["outerHTML"],
							"",
							$box["innerHTML"]
						);
					
					$url =
						$this->unshiturl(
							$link["attributes"]["href"]
						);
					
					$this->fuckhtml->load($box);
					break;
				}
			}
			
			// remains of box should be description
			$description[] = [
				"type" => "text",
				"value" =>
					$this->titledots(
						$this->fuckhtml
						->getTextContent(
							$box
						)
					)
			];
			
			$out["answer"][] = [
				"title" => $wiki_title,
				"description" => $description,
				"url" => $url,
				"thumb" => $thumb,
				"table" => $table,
				"sublink" => []
			];
		}
	}
	
	return $out;
}
/**
 * Scrape Google video search (tbm=vid).
 *
 * With $get["npt"] set, the JSON-encoded request parameters stored with
 * the token are reused. Otherwise they are built from $get["s"],
 * "country", "nsfw", "older", "newer", "duration", "quality" and
 * "captions". The page is handed to parsepage() and its "web" results
 * are reshaped into video entries.
 *
 * @param array $get Search parameters
 * @return array Keys: status, npt, video, author, livestream, playlist, reel
 * @throws Exception When the page cannot be fetched
 */
public function video($get){
	
	if($get["npt"]){
		
		[$params, $proxy] = $this->backend->get($get["npt"], "video");
		
		$params = json_decode($params, true);
		$search = $params["q"];
	}else{
		
		$search = $get["s"];
		$country = $get["country"];
		$nsfw = $get["nsfw"];
		$older = $get["older"];
		$newer = $get["newer"];
		$duration = $get["duration"];
		$quality = $get["quality"];
		$captions = $get["captions"];
		$proxy = $this->backend->get_ip();
		
		$params = [
			"q" => $search,
			"tbm" => "vid",
			"hl" => "en",
			"num" => "20"
		];
		
		// country
		if($country != "any"){
			
			$params["gl"] = $country;
		}
		
		// nsfw
		$params["safe"] = $nsfw == "yes" ? "off" : "active";
		
		// tbs is a list of ready-made "key:value" pairs. The date range
		// used to be stored under associative keys, which implode()
		// dropped, producing "1,<min>,<max>" instead of
		// "cdr:1,cd_min:<min>,cd_max:<max>".
		$tbs = [];
		
		// get date
		$older = $older === false ? null : date("m/d/Y", $older);
		$newer = $newer === false ? null : date("m/d/Y", $newer);
		
		if(
			$older !== null ||
			$newer !== null
		){
			
			$tbs[] = "cdr:1";
			$tbs[] = "cd_min:" . $newer;
			$tbs[] = "cd_max:" . $older;
		}
		
		// duration
		if($duration != "any"){
			
			$tbs[] = "dur:" . $duration;
		}
		
		// quality
		if($quality != "any"){
			
			$tbs[] = "hq:" . $quality;
		}
		
		// captions
		if($captions != "any"){
			
			$tbs[] = "cc:" . $captions;
		}
		
		// append tbs
		if(count($tbs) !== 0){
			
			$params["tbs"] = implode(",", $tbs);
		}
	}
	
	try{
		$html =
			$this->get(
				$proxy,
				"https://www.google.com/search",
				$params
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to get HTML");
	}
	
	$response = $this->parsepage($html, "videos", $search, $proxy, $params);
	
	$out = [
		"status" => "ok",
		"npt" => $response["npt"],
		"video" => [],
		"author" => [],
		"livestream" => [],
		"playlist" => [],
		"reel" => []
	];
	
	foreach($response["web"] as $result){
		
		$out["video"][] = [
			"title" => $result["title"],
			"description" => $result["description"],
			"author" => [
				"name" => isset($result["table"]["Author"]) ? $result["table"]["Author"] : null,
				"url" => null,
				"avatar" => null
			],
			"date" => $result["date"],
			// "Duration" comes as [h:]m:s text, convert to seconds
			"duration" => isset($result["table"]["Duration"]) ? $this->hms2int($result["table"]["Duration"]) : null,
			"views" => null,
			"thumb" => $result["thumb"],
			"url" => $result["url"]
		];
	}
	
	return $out;
}
/**
 * Scrape Google news search (tbm=nws).
 *
 * With $get["npt"] set, the stored payload (href of the "pnnext" link,
 * relative to https://www.google.com) is fetched. Otherwise the request
 * is built from $get["s"], "country", "nsfw", "older", "newer" and "sort".
 *
 * @param array $get Search parameters
 * @return array Keys: status, npt, news
 * @throws Exception When the page cannot be fetched, a captcha is served
 *                   or the result container is missing
 */
public function news($get){
	
	if($get["npt"]){
		
		[$req, $proxy] = $this->backend->get($get["npt"], "news");
		
		try{
			$html =
				$this->get(
					$proxy,
					"https://www.google.com" . $req,
					[]
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get HTML");
		}
	}else{
		
		$search = $get["s"];
		$country = $get["country"];
		$nsfw = $get["nsfw"];
		$older = $get["older"];
		$newer = $get["newer"];
		$sort = $get["sort"];
		$proxy = $this->backend->get_ip();
		
		$params = [
			"q" => $search,
			"tbm" => "nws",
			"hl" => "en",
			"num" => "20"
		];
		
		// country
		if($country != "any"){
			
			$params["gl"] = $country;
		}
		
		// nsfw
		$params["safe"] = $nsfw == "yes" ? "off" : "active";
		
		$tbs = [];
		
		// get date
		$older = $older === false ? null : date("m/d/Y", $older);
		$newer = $newer === false ? null : date("m/d/Y", $newer);
		
		if(
			$older !== null ||
			$newer !== null
		){
			
			$tbs["cdr"] = "1";
			$tbs["cd_min"] = $newer;
			$tbs["cd_max"] = $older;
		}
		
		// relevance
		if($sort == "date"){
			
			$tbs["sbd"] = "1";
		}
		
		// append tbs
		if(count($tbs) !== 0){
			
			$params["tbs"] = "";
			
			foreach($tbs as $key => $value){
				
				$params["tbs"] .= $key . ":" . $value . ",";
			}
			
			$params["tbs"] = rtrim($params["tbs"], ",");
		}
		
		// same failure handling as the next-page request above
		try{
			$html =
				$this->get(
					$proxy,
					"https://www.google.com/search",
					$params
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to get HTML");
		}
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"news" => []
	];
	
	$this->fuckhtml->load($html);
	
	$this->detect_sorry();
	
	// get images
	$this->scrape_dimg($html);
	
	// parse styles
	$this->parsestyles();
	
	$center_col =
		$this->fuckhtml
		->getElementById(
			"center_col",
			"div"
		);
	
	// other getElementById() callers in this file test for false,
	// accept both "not found" markers so the guard cannot be bypassed
	if(
		$center_col === null ||
		$center_col === false
	){
		
		throw new Exception("Could not grep result div");
	}
	
	$this->fuckhtml->load($center_col);
	
	// get next page
	$npt =
		$this->fuckhtml
		->getElementById(
			"pnnext",
			"a"
		);
	
	if(
		$npt !== false &&
		$npt !== null
	){
		
		$out["npt"] =
			$this->backend->store(
				$this->fuckhtml
				->getTextContent(
					$npt["attributes"]
					["href"]
				),
				"news",
				$proxy
			);
	}
	
	$as =
		$this->fuckhtml
		->getElementsByAttributeName(
			"jsname",
			"a"
		);
	
	foreach($as as $a){
		
		$this->fuckhtml->load($a);
		
		// get title
		$title =
			$this->fuckhtml
			->getElementsByAttributeValue(
				"role",
				"heading",
				"div"
			);
		
		if(count($title) === 0){
			
			continue;
		}
		
		$title =
			$this->titledots(
				$this->fuckhtml
				->getTextContent(
					$title[0]
				)
			);
		
		// get thumbnail
		$image =
			$this->fuckhtml
			->getElementsByAttributeName(
				"id",
				"img"
			);
		
		// check for padded title node, if found, we're inside a carousel
		$probe =
			count(
				$this->fuckhtml
				->getElementsByClassName(
					$this->getstyle(
						[
							"padding" => "16px 16px 40px 16px"
						]
					),
					"div"
				)
			) !== 0;
		
		if(
			count($image) !== 0 &&
			!isset($image[0]["attributes"]["width"])
		){
			
			$thumb = [
				"url" =>
					$this->getdimg(
						$image[0]["attributes"]["id"]
					),
				"ratio" => $probe === true ? "16:9" : "1:1"
			];
		}else{
			
			$thumb = [
				"url" => null,
				"ratio" => null
			];
		}
		
		// carousel entries carry no description
		$description = null;
		
		if($probe === false){
			
			$desc_divs =
				$this->fuckhtml
				->getElementsByAttributeName(
					"style",
					"div"
				);
			
			foreach($desc_divs as $desc){
				
				if(
					strpos(
						$desc["attributes"]["style"],
						"margin-top:"
					) !== false
				){
					
					$description =
						$this->titledots(
							$this->fuckhtml
							->getTextContent(
								$desc
							)
						);
					break;
				}
			}
		}
		
		// get author
		$author =
			$this->fuckhtml
			->getElementsByClassName(
				$this->getstyle(
					[
						"overflow" => "hidden",
						"text-align" => "left",
						"text-overflow" => "ellipsis",
						"white-space" => "nowrap",
						"margin-bottom" => "8px"
					]
				),
				"div"
			);
		
		if(count($author) !== 0){
			
			$author =
				$this->fuckhtml
				->getTextContent(
					$author[0]
				);
		}else{
			
			$author = null;
		}
		
		// get date
		$date = null;
		
		$date_div =
			$this->fuckhtml
			->getElementsByAttributeName(
				"style",
				"div"
			);
		
		foreach($date_div as $d){
			
			if(
				strpos(
					$d["attributes"]["style"],
					"bottom:"
				) === false
			){
				
				continue;
			}
			
			$this->fuckhtml->load($d);
			
			$span = $this->fuckhtml->getElementsByTagName("span");
			
			if(count($span) !== 0){
				
				$date =
					strtotime(
						$this->fuckhtml
						->getTextContent(
							$span[count($span) - 1]
						)
					);
				
				// unparsable date text must not leak out as false
				if($date === false){
					
					$date = null;
				}
			}
			
			break;
		}
		
		$out["news"][] = [
			"title" => $title,
			"author" => $author,
			"description" => $description,
			"date" => $date,
			"thumb" => $thumb,
			"url" =>
				$this->unshiturl(
					$a["attributes"]
					["href"]
				)
		];
	}
	
	return $out;
}
/**
 * Scrape Google image search (udm=2).
 *
 * With $get["npt"] set, the JSON-encoded request parameters stored with
 * the token are reused. Otherwise they are built from $get["s"],
 * "country", "nsfw", "time", "size", "ratio", "color", "type", "format"
 * and "rights".
 *
 * @param array $get Search parameters
 * @return array Keys: status, npt, image
 * @throws Exception On empty search term, fetch failure or captcha
 */
public function image($get){
	
	// generate parameters
	if($get["npt"]){
		
		// NOTE(review): the token is stored under "image" below; this
		// used to read "images". Use the same page name on both sides.
		[$params, $proxy] =
			$this->backend->get(
				$get["npt"],
				"image"
			);
		
		$params = json_decode($params, true);
	}else{
		
		$search = $get["s"];
		
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$proxy = $this->backend->get_ip();
		$country = $get["country"];
		$nsfw = $get["nsfw"];
		$time = $get["time"];
		$size = $get["size"];
		$ratio = $get["ratio"];
		$color = $get["color"];
		$type = $get["type"];
		$format = $get["format"];
		$rights = $get["rights"];
		
		$params = [
			"q" => $search,
			"udm" => "2" // get images
		];
		
		// country (image search uses cr instead of gl)
		if($country != "any"){
			
			$params["cr"] = "country" . strtoupper($country);
		}
		
		// nsfw
		$params["safe"] = $nsfw == "yes" ? "off" : "active";
		
		// generate tbs
		$tbs = [];
		
		// time
		if($time != "any"){
			
			$tbs["qdr"] = $time;
		}
		
		// size
		if($size != "any"){
			
			$params["imgsz"] = $size;
		}
		
		// ratio
		if($ratio != "any"){
			
			$params["imgar"] = $ratio;
		}
		
		// color
		if($color != "any"){
			
			if(
				$color == "color" ||
				$color == "trans"
			){
				
				$params["imgc"] = $color;
			}elseif($color == "bnw"){
				
				$params["imgc"] = "gray";
			}else{
				
				// any other value is a specific color name
				$tbs["ic"] = "specific";
				$tbs["isc"] = $color;
			}
		}
		
		// type
		if($type != "any"){
			
			$tbs["itp"] = $type;
		}
		
		// format
		if($format != "any"){
			
			$params["as_filetype"] = $format;
		}
		
		// rights (tbs)
		if($rights != "any"){
			
			$tbs["sur"] = $rights;
		}
		
		// append tbs
		if(count($tbs) !== 0){
			
			$params["tbs"] = "";
			
			foreach($tbs as $key => $value){
				
				$params["tbs"] .= $key . ":" . $value . ",";
			}
			
			$params["tbs"] = rtrim($params["tbs"], ",");
		}
	}
	
	try{
		$html =
			$this->get(
				$proxy,
				"https://www.google.com/search",
				$params
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to get search page");
	}
	
	$this->fuckhtml->load($html);
	
	$this->detect_sorry();
	
	// get javascript images
	$this->scrape_imagearr($html);
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	$images =
		$this->fuckhtml
		->getElementsByClassName(
			"ivg-i",
			"div"
		);
	
	foreach($images as $div){
		
		$this->fuckhtml->load($div);
		
		$imgs = $this->fuckhtml->getElementsByTagName("img");
		
		// skip tiles we cannot resolve to image sources instead of
		// reading undefined offsets
		if(
			count($imgs) === 0 ||
			!isset($div["attributes"]["data-docid"]) ||
			!isset($this->image_arr[$div["attributes"]["data-docid"]])
		){
			
			continue;
		}
		
		$image = $imgs[0];
		
		$out["image"][] = [
			"title" =>
				$this->titledots(
					$this->fuckhtml
					->getTextContent(
						isset($image["attributes"]["alt"]) ?
							$image["attributes"]["alt"] : ""
					)
				),
			"source" =>
				$this->image_arr[
					$div["attributes"]["data-docid"]
				],
			"url" =>
				isset($div["attributes"]["data-lpage"]) ?
					$this->fuckhtml
					->getTextContent(
						$div["attributes"]["data-lpage"]
					) : null
		];
	}
	
	// as usual, no way to check if there is a next page reliably
	if(count($out["image"]) > 50){
		
		if(!isset($params["start"])){
			
			$params["start"] = 10;
		}else{
			
			$params["start"] += 10;
		}
		
		$out["npt"] =
			$this->backend
			->store(
				json_encode($params),
				"image",
				$proxy
			);
	}
	
	return $out;
}
/**
 * Turn a Google result link into a clean target URL.
 *
 * Steps: decode the link text, unwrap Google redirect links (target in
 * the "imgurl" or "q" parameter), then rewrite a few known hosts:
 * mobile wikipedia/imdb/youtube hosts, referrer parameters on
 * play.google.* and twitter.com, and maps.google.* direction links.
 *
 * @param string $url Raw href (may be relative and entity-encoded)
 * @param bool $return_size When true, also return the image metadata
 *                          found in the original link's query string
 * @return string|array Clean URL, or with $return_size an array with keys
 *                      url, ref, thumb_width, thumb_height, image_width,
 *                      image_height (missing values are null)
 */
private function unshiturl($url, $return_size = false){
	
	// decode
	$url =
		$this->fuckhtml
		->getTextContent(
			$url
		);
	
	$url_parts = parse_url($url);
	
	// query of the ORIGINAL link, also used for the size info below
	if(isset($url_parts["query"])){
		
		parse_str($url_parts["query"], $query);
	}else{
		
		$query = [];
	}
	
	if(
		!isset(
			$url_parts["host"]
		) ||
		stripos($url_parts["host"], "google.") !== false
	){
		
		// no host, we have a tracking url
		if(isset($query["imgurl"])){
			
			$url = $query["imgurl"];
		}
		elseif(isset($query["q"])){
			
			$url = $query["q"];
		}
	}
	
	// rewrite URLs to remove extra tracking parameters
	$domain = parse_url($url, PHP_URL_HOST);
	
	if(!is_string($domain)){
		
		// relative or malformed url: none of the rewrites apply
		$domain = "";
	}
	
	if(
		preg_match(
			'/wikipedia\.org$/',
			$domain
		)
	){
		
		// rewrite wikipedia mobile URLs to desktop
		$url =
			$this->replacedomain(
				$url,
				preg_replace(
					'/([a-z0-9]+)(\.m\.)/',
					'$1.',
					$domain
				)
			);
	}
	elseif(
		preg_match(
			'/imdb\.com$|youtube\.[^.]+$/',
			$domain
		)
	){
		
		// rewrite imdb and youtube mobile URLs too
		$url =
			$this->replacedomain(
				$url,
				preg_replace(
					'/^m\./',
					"",
					$domain
				)
			);
	}
	elseif(
		preg_match(
			'/play\.google\.[^.]+$/',
			$domain
		)
	){
		
		// remove referrers from play.google.com
		$url =
			$this->striptrackingparams(
				$url,
				["referrer", "hl", "gl"]
			);
	}
	elseif(
		preg_match(
			'/twitter\.com$/',
			$domain
		)
	){
		
		// remove more referrers from twitter.com
		$url =
			$this->striptrackingparams(
				$url,
				["ref_src"]
			);
	}
	elseif(
		preg_match(
			'/maps\.google\.[^.]+/',
			$domain
		)
	){
		
		if(stripos($url, "maps?") !== false){
			
			$u_query = parse_url($url, PHP_URL_QUERY);
			
			if(is_string($u_query)){
				
				parse_str($u_query, $u_query);
				
				// keep only the destination address
				if(isset($u_query["daddr"])){
					
					$url =
						"https://maps.google.com/maps?daddr=" .
						urlencode($u_query["daddr"]);
				}
			}
		}
	}
	
	if($return_size){
		
		return [
			"url" => $url,
			"ref" => isset($query["imgrefurl"]) ? $query["imgrefurl"] : null,
			"thumb_width" => isset($query["tbnw"]) ? (int)$query["tbnw"] : null,
			"thumb_height" => isset($query["tbnh"]) ? (int)$query["tbnh"] : null,
			"image_width" => isset($query["w"]) ? (int)$query["w"] : null,
			"image_height" => isset($query["h"]) ? (int)$query["h"] : null
		];
	}
	
	return $url;
}

/**
 * Remove the given query parameters from a URL.
 *
 * The URL is returned untouched when it has no query string or none of
 * the parameters are present, so unrelated URLs are never re-encoded.
 *
 * @param string $url URL to clean
 * @param array $keys Names of the query parameters to drop
 * @return string
 */
private function striptrackingparams($url, $keys){
	
	$old_query = parse_url($url, PHP_URL_QUERY);
	
	if(
		!is_string($old_query) ||
		$old_query === ""
	){
		
		return $url;
	}
	
	parse_str($old_query, $params);
	
	$kept = array_diff_key($params, array_flip($keys));
	
	if(count($kept) === count($params)){
		
		// nothing to strip
		return $url;
	}
	
	$new_query = http_build_query($kept);
	
	// drop the "?" as well when no parameter survives
	return
		str_replace(
			"?" . $old_query,
			$new_query === "" ? "" : "?" . $new_query,
			$url
		);
}
/**
 * Swap the host of an http(s) URL.
 *
 * Only the first "scheme://host" occurrence is rewritten, so URLs that
 * are embedded later in the path or query string stay intact. The new
 * host is inserted literally: a host starting with a digit or containing
 * "$" / "\" is not interpreted as a regex back-reference.
 *
 * @param string $url URL to rewrite
 * @param string $domain Replacement host
 * @return string
 */
private function replacedomain($url, $domain){
	
	return
		preg_replace_callback(
			'/(https?:\/\/)([^\/]+)/',
			function($match) use ($domain){
				
				return $match[1] . $domain;
			},
			$url,
			1
		);
}
/**
 * Strip surrounding whitespace and dots (e.g. a trailing "...") from a
 * piece of text.
 *
 * @param string $title Text to clean
 * @return string
 */
private function titledots($title){
	
	// default trim() characters plus the dot
	$strip = " .\t\n\r\0\x0B";
	
	return trim($title, $strip);
}
2024-02-25 14:51:18 +00:00
2024-06-13 02:41:02 +00:00
/**
 * Convert a "[[h:]m:]s" duration string to seconds.
 *
 * @param string $time Duration with up to three ":"-separated fields
 * @return int Number of seconds
 */
private function hms2int($time){
	
	$seconds = 0;
	
	// each field is worth 60 times the one to its right
	foreach(explode(":", $time, 3) as $field){
		
		$seconds = ($seconds * 60) + (int)$field;
	}
	
	return $seconds;
}
/**
 * Abort when the currently loaded document contains a
 * <div id="recaptcha"> element.
 *
 * @throws Exception When that element is found
 */
private function detect_sorry(){
	
	$captcha_div = $this->fuckhtml->getElementById("recaptcha", "div");
	
	if($captcha_div === false){
		
		// no captcha on this page
		return;
	}
	
	throw new Exception("Google returned a captcha");
}
2023-07-22 18:41:14 +00:00
}