<?php // @TODO check for consent.google.com page, if need be class google{ public function __construct(){ include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); include "lib/backend.php"; $this->backend = new backend("google"); } public function getfilters($page){ $base = [ "country" => [ // gl=<country> (image: cr=countryAF) "display" => "Country", "option" => [ "any" => "Instance's country", "af" => "Afghanistan", "al" => "Albania", "dz" => "Algeria", "as" => "American Samoa", "ad" => "Andorra", "ao" => "Angola", "ai" => "Anguilla", "aq" => "Antarctica", "ag" => "Antigua and Barbuda", "ar" => "Argentina", "am" => "Armenia", "aw" => "Aruba", "au" => "Australia", "at" => "Austria", "az" => "Azerbaijan", "bs" => "Bahamas", "bh" => "Bahrain", "bd" => "Bangladesh", "bb" => "Barbados", "by" => "Belarus", "be" => "Belgium", "bz" => "Belize", "bj" => "Benin", "bm" => "Bermuda", "bt" => "Bhutan", "bo" => "Bolivia", "ba" => "Bosnia and Herzegovina", "bw" => "Botswana", "bv" => "Bouvet Island", "br" => "Brazil", "io" => "British Indian Ocean Territory", "bn" => "Brunei Darussalam", "bg" => "Bulgaria", "bf" => "Burkina Faso", "bi" => "Burundi", "kh" => "Cambodia", "cm" => "Cameroon", "ca" => "Canada", "cv" => "Cape Verde", "ky" => "Cayman Islands", "cf" => "Central African Republic", "td" => "Chad", "cl" => "Chile", "cn" => "China", "cx" => "Christmas Island", "cc" => "Cocos (Keeling) Islands", "co" => "Colombia", "km" => "Comoros", "cg" => "Congo", "cd" => "Congo, the Democratic Republic", "ck" => "Cook Islands", "cr" => "Costa Rica", "ci" => "Cote D'ivoire", "hr" => "Croatia", "cu" => "Cuba", "cy" => "Cyprus", "cz" => "Czech Republic", "dk" => "Denmark", "dj" => "Djibouti", "dm" => "Dominica", "do" => "Dominican Republic", "ec" => "Ecuador", "eg" => "Egypt", "sv" => "El Salvador", "gq" => "Equatorial Guinea", "er" => "Eritrea", "ee" => "Estonia", "et" => "Ethiopia", "fk" => "Falkland Islands (Malvinas)", "fo" => "Faroe Islands", "fj" => "Fiji", "fi" => "Finland", "fr" => "France", "gf" => "French Guiana", "pf" => "French Polynesia", "tf" => "French Southern Territories", "ga" => "Gabon", "gm" => "Gambia", "ge" => "Georgia", "de" => "Germany", "gh" => "Ghana", "gi" => "Gibraltar", "gr" => "Greece", "gl" => "Greenland", "gd" => "Grenada", "gp" => "Guadeloupe", "gu" => "Guam", "gt" => "Guatemala", "gn" => "Guinea", "gw" => "Guinea-Bissau", "gy" => "Guyana", "ht" => "Haiti", "hm" => "Heard Island and Mcdonald Islands", "va" => "Holy See (Vatican City State)", "hn" => "Honduras", "hk" => "Hong Kong", "hu" => "Hungary", "is" => "Iceland", "in" => "India", "id" => "Indonesia", "ir" => "Iran, Islamic Republic", "iq" => "Iraq", "ie" => "Ireland", "il" => "Israel", "it" => "Italy", "jm" => "Jamaica", "jp" => "Japan", "jo" => "Jordan", "kz" => "Kazakhstan", "ke" => "Kenya", "ki" => "Kiribati", "kp" => "Korea, Democratic People's Republic", "kr" => "Korea, Republic", "kw" => "Kuwait", "kg" => "Kyrgyzstan", "la" => "Lao People's Democratic Republic", "lv" => "Latvia", "lb" => "Lebanon", "ls" => "Lesotho", "lr" => "Liberia", "ly" => "Libyan Arab Jamahiriya", "li" => "Liechtenstein", "lt" => "Lithuania", "lu" => "Luxembourg", "mo" => "Macao", "mk" => "Macedonia, the Former Yugosalv Republic", "mg" => "Madagascar", "mw" => "Malawi", "my" => "Malaysia", "mv" => "Maldives", "ml" => "Mali", "mt" => "Malta", "mh" => "Marshall Islands", "mq" => "Martinique", "mr" => "Mauritania", "mu" => "Mauritius", "yt" => "Mayotte", "mx" => "Mexico", "fm" => "Micronesia, Federated States", "md" => "Moldova, Republic", "mc" => "Monaco", "mn" => "Mongolia", "ms" => "Montserrat", "ma" => "Morocco", "mz" => "Mozambique", "mm" => "Myanmar", "na" => "Namibia", "nr" => "Nauru", "np" => "Nepal", "nl" => "Netherlands", "an" => "Netherlands Antilles", "nc" => "New Caledonia", "nz" => "New Zealand", "ni" => "Nicaragua", "ne" => "Niger", "ng" => "Nigeria", "nu" => "Niue", "nf" => "Norfolk Island", "mp" => "Northern Mariana Islands", "no" => "Norway", "om" => "Oman", "pk" => "Pakistan", "pw" => "Palau", "ps" => "Palestinian Territory, Occupied", "pa" => "Panama", "pg" => "Papua New Guinea", "py" => "Paraguay", "pe" => "Peru", "ph" => "Philippines", "pn" => "Pitcairn", "pl" => "Poland", "pt" => "Portugal", "pr" => "Puerto Rico", "qa" => "Qatar", "re" => "Reunion", "ro" => "Romania", "ru" => "Russian Federation", "rw" => "Rwanda", "sh" => "Saint Helena", "kn" => "Saint Kitts and Nevis", "lc" => "Saint Lucia", "pm" => "Saint Pierre and Miquelon", "vc" => "Saint Vincent and the Grenadines", "ws" => "Samoa", "sm" => "San Marino", "st" => "Sao Tome and Principe", "sa" => "Saudi Arabia", "sn" => "Senegal", "cs" => "Serbia and Montenegro", "sc" => "Seychelles", "sl" => "Sierra Leone", "sg" => "Singapore", "sk" => "Slovakia", "si" => "Slovenia", "sb" => "Solomon Islands", "so" => "Somalia", "za" => "South Africa", "gs" => "South Georgia and the South Sandwich Islands", "es" => "Spain", "lk" => "Sri Lanka", "sd" => "Sudan", "sr" => "Suriname", "sj" => "Svalbard and Jan Mayen", "sz" => "Swaziland", "se" => "Sweden", "ch" => "Switzerland", "sy" => "Syrian Arab Republic", "tw" => "Taiwan, Province of China", "tj" => "Tajikistan", "tz" => "Tanzania, United Republic", "th" => "Thailand", "tl" => "Timor-Leste", "tg" => "Togo", "tk" => "Tokelau", "to" => "Tonga", "tt" => "Trinidad and Tobago", "tn" => "Tunisia", "tr" => "Turkey", "tm" => "Turkmenistan", "tc" => "Turks and Caicos Islands", "tv" => "Tuvalu", "ug" => "Uganda", "ua" => "Ukraine", "ae" => "United Arab Emirates", "uk" => "United Kingdom", "us" => "United States", "um" => "United States Minor Outlying Islands", "uy" => "Uruguay", "uz" => "Uzbekistan", "vu" => "Vanuatu", "ve" => "Venezuela", "vn" => "Viet Nam", "vg" => "Virgin Islands, British", "vi" => "Virgin Islands, U.S.", "wf" => "Wallis and Futuna", "eh" => "Western Sahara", "ye" => "Yemen", "zm" => "Zambia", "zw" => "Zimbabwe" ] ], "nsfw" => [ "display" => "NSFW", "option" => [ "yes" => "Yes", // safe=active "no" => "No" // safe=off ] ] ]; switch($page){ case "web": return array_merge( $base, [ "lang" => [ // lr=<lang> (prefix lang with "lang_") "display" => "Language", "option" => [ "any" => "Any language", "ar" => "Arabic", "bg" => "Bulgarian", "ca" => "Catalan", "cs" => "Czech", "da" => "Danish", "de" => "German", "el" => "Greek", "en" => "English", "es" => "Spanish", "et" => "Estonian", "fi" => "Finnish", "fr" => "French", "hr" => "Croatian", "hu" => "Hungarian", "id" => "Indonesian", "is" => "Icelandic", "it" => "Italian", "iw" => "Hebrew", "ja" => "Japanese", "ko" => "Korean", "lt" => "Lithuanian", "lv" => "Latvian", "nl" => "Dutch", "no" => "Norwegian", "pl" => "Polish", "pt" => "Portuguese", "ro" => "Romanian", "ru" => "Russian", "sk" => "Slovak", "sl" => "Slovenian", "sr" => "Serbian", "sv" => "Swedish", "tr" => "Turkish", "zh-CN" => "Chinese (Simplified)", "zh-TW" => "Chinese (Traditional)" ] ], "newer" => [ // tbs "display" => "Newer than", "option" => "_DATE" ], "older" => [ "display" => "Older than", "option" => "_DATE" ], "spellcheck" => [ "display" => "Spellcheck", "option" => [ "yes" => "Yes", "no" => "No" ] ] ] ); break; case "images": return array_merge( $base, [ "time" => [ // tbs=qdr:<time> "display" => "Time posted", "option" => [ "any" => "Any time", "d" => "Past 24 hours", "w" => "Past week", "m" => "Past month", "y" => "Past year" ] ], "size" => [ // imgsz "display" => "Size", "option" => [ "any" => "Any size", "l" => "Large", "m" => "Medium", "i" => "Icon", "qsvga" => "Larger than 400x300", "vga" => "Larger than 640x480", "svga" => "Larger than 800x600", "xga" => "Larger than 1024x768", "2mp" => "Larger than 2MP", "4mp" => "Larger than 4MP", "6mp" => "Larger than 6MP", "8mp" => "Larger than 8MP", "10mp" => "Larger than 10MP", "12mp" => "Larger than 12MP", "15mp" => "Larger than 15MP", "20mp" => "Larger than 20MP", "40mp" => "Larger than 40MP", "70mp" => "Larger than 70MP" ] ], "ratio" => [ // imgar "display" => "Aspect ratio", "option" => [ "any" => "Any ratio", "t|xt" => "Tall", "s" => "Square", "w" => "Wide", "xw" => "Panoramic" ] ], "color" => [ // imgc "display" => "Color", "option" => [ "any" => "Any color", "color" => "Full color", "bnw" => "Black & white", "trans" => "Transparent", // from here, imgcolor "red" => "Red", "orange" => "Orange", "yellow" => "Yellow", "green" => "Green", "teal" => "Teal", "blue" => "Blue", "purple" => "Purple", "pink" => "Pink", "white" => "White", "gray" => "Gray", "black" => "Black", "brown" => "Brown" ] ], "type" => [ // tbs=itp:<type> "display" => "Type", "option" => [ "any" => "Any type", "clipart" => "Clip Art", "lineart" => "Line Drawing", "animated" => "Animated" ] ], "format" => [ // as_filetype "display" => "Format", "option" => [ "any" => "Any format", "jpg" => "JPG", "gif" => "GIF", "png" => "PNG", "bmp" => "BMP", "svg" => "SVG", "webp" => "WEBP", "ico" => "ICO", "craw" => "RAW" ] ], "rights" => [ // tbs=sur:<rights> "display" => "Usage rights", "option" => [ "any" => "Any license", "cl" => "Creative Commons licenses", "ol" => "Commercial & other licenses" ] ] ] ); break; case "videos": return array_merge( $base, [ "newer" => [ // tbs "display" => "Newer than", "option" => "_DATE" ], "older" => [ "display" => "Older than", "option" => "_DATE" ], "duration" => [ "display" => "Duration", "option" => [ "any" => "Any duration", "s" => "Short (0-4min)", // tbs=dur:s "m" => "Medium (4-20min)", // tbs=dur:m "l" => "Long (20+ min)" // tbs=dur:l ] ], "quality" => [ "display" => "Quality", "option" => [ "any" => "Any quality", "h" => "High quality" // tbs=hq:h ] ], "captions" => [ "display" => "Captions", "option" => [ "any" => "No preference", "yes" => "Closed captioned" // tbs=cc:1 ] ] ] ); break; case "news": return array_merge( $base, [ "newer" => [ // tbs "display" => "Newer than", "option" => "_DATE" ], "older" => [ "display" => "Older than", "option" => "_DATE" ], "sort" => [ "display" => "Sort", "option" => [ "relevance" => "Relevance", "date" => "Date" // sbd:1 ] ] ] ); break; } } private function get($proxy, $url, $get = [], $use_lynx = false){ $curlproc = curl_init(); if($use_lynx === false){ $headers = [ "User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", "DNT: 1", //"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY", "Connection: keep-alive", "Upgrade-Insecure-Requests: 1", "Sec-Fetch-Dest: document", "Sec-Fetch-Mode: navigate", "Sec-Fetch-Site: none", "Sec-Fetch-User: ?1", "Priority: u=1", "TE: trailers" ]; // use http2 curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); }else{ $headers = [ "Accept: text/html, text/plain, text/sgml, */*;q=0.01", "Accept-Encoding: gzip, compress, bzip2", "Accept-Language: en", "User-Agent: Lynx/2.9.0dev.12 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/3.7.8" ]; } if($get !== []){ $get = http_build_query($get); $url .= "?" . $get; } curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); // follow redirects curl_setopt($curlproc, CURLOPT_FOLLOWLOCATION, true); $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); if(curl_errno($curlproc)){ throw new Exception(curl_error($curlproc)); } curl_close($curlproc); if($use_lynx){ return mb_convert_encoding($data, "UTF-8", "ISO-8859-1"); } return $data; } private function parsepage($html, $pagetype, $search, $proxy, $params){ $out = [ "status" => "ok", "spelling" => [ "type" => "no_correction", "using" => null, "correction" => null ], "npt" => null, "answer" => [], "web" => [], "image" => [], "video" => [], "news" => [], "related" => [] ]; $this->fuckhtml->load($html); $this->detect_sorry(); // parse all <style> tags $this->parsestyles(); // get javascript images $this->scrape_dimg($html); // get html blobs preg_match_all( '/function\(\){window\.jsl\.dh\(\'([^\']+?)\',\'(.+?[^\'])\'\);/', $html, $blobs ); $this->blobs = []; if(isset($blobs[1])){ for($i=0; $i<count($blobs[1]); $i++){ $this->blobs[$blobs[1][$i]] = $this->fuckhtml ->parseJsString( $blobs[2][$i] ); } } $this->scrape_imagearr($html); // // load result column // $result_div = $this->fuckhtml ->getElementById( "center_col", "div" ); if($result_div === false){ throw new Exception("Failed to grep result div"); } $this->fuckhtml->load($result_div); // important for later $last_page = false; // // Get text results // $results = $this->fuckhtml ->getElementsByClassName( "g", "div" ); $this->skip_next = false; foreach($results as $result){ if($this->skip_next){ $this->skip_next = false; continue; } $this->fuckhtml->load($result); $web = [ "title" => null, "description" => null, "url" => null, "date" => null, "type" => "web", "thumb" => [ "url" => null, "ratio" => null ], "sublink" => [], "table" => [] ]; // Detect presence of sublinks $g = $this->fuckhtml ->getElementsByClassName( "g", "div" ); if(count($g) > 0){ // skip on next iteration $this->skip_next = true; } // get title $h3 = $this->fuckhtml ->getElementsByTagName( "h3" ); if(count($h3) === 0){ continue; } $web["title"] = $this->titledots( $this->fuckhtml ->getTextContent( $h3[0] ) ); // get url $as = $this->fuckhtml ->getElementsByTagName( "a" ); $web["url"] = $this->unshiturl( $as[0] ["attributes"] ["href"] ); if( !preg_match( '/^http/', $web["url"] ) ){ // skip if invalid url is found continue; } // // get viewcount, time posted and follower count from <cite> tag // $cite = $this->fuckhtml ->getElementsByTagName( "cite" ); if(count($cite) !== 0){ $this->fuckhtml->load($cite[0]); $spans = $this->fuckhtml ->getElementsByTagName("span"); if(count($spans) === 0){ $cites = explode( "·", $this->fuckhtml ->getTextContent( $cite[0] ) ); foreach($cites as $cite){ $cite = trim($cite); if( preg_match( '/(.+) (views|followers|likes)$/', $cite, $match ) ){ $web["table"][ucfirst($match[2])] = $match[1]; }elseif( preg_match( '/ago$/', $cite ) ){ $web["date"] = strtotime($cite); } } } // reset $this->fuckhtml->load($result); } // // attempt to fetch description cleanly // $description = $this->fuckhtml ->getElementsByAttributeValue( "style", "-webkit-line-clamp:2" ); if(count($description) !== 0){ $web["description"] = $this->titledots( $this->fuckhtml ->getTextContent( $description[0] ) ); }else{ // use ANOTHER method where the description is a header of the result $description = $this->fuckhtml ->getElementsByAttributeValue( "data-attrid", "wa:/description" ); if(count($description) !== 0){ // get date off that shit $date = $this->fuckhtml ->getElementsByClassName( $this->getstyle( [ "font-size" => "12px", "line-height" => "1.34", "display" => "inline-block", "font-family" => "google sans,arial,sans-serif", "padding-right" => "0", "white-space" => "nowrap" ] ), "span" ); if(count($date) !== 0){ $description[0]["innerHTML"] = str_replace( $date[0]["outerHTML"], "", $description[0]["innerHTML"] ); $web["date"] = strtotime( $this->fuckhtml ->getTextContent( $date[0] ) ); } $web["description"] = $this->fuckhtml ->getTextContent( $description[0] ); }else{ // Yes.. You guessed it, use ANOTHER method to get descriptions // off youtube containers $description = $this->fuckhtml ->getElementsByClassName( $this->getstyle( [ "-webkit-box-orient" => "vertical", "display" => "-webkit-box", "font-size" => "14px", "-webkit-line-clamp" => "2", "line-height" => "22px", "overflow" => "hidden", "word-break" => "break-word", "color" => "#4d5156" ] ), "div" ); if(count($description) !== 0){ // check for video duration $duration = $this->fuckhtml ->getElementsByClassName( $this->getstyle( [ "background-color" => "rgba(0,0,0,0.6)", "color" => "#fff", "fill" => "#fff" ] ), "div" ); if(count($duration) !== 0){ $web["table"]["Duration"] = $this->fuckhtml ->getTextContent( $duration[0] ); } $web["description"] = $this->titledots( html_entity_decode( $this->fuckhtml ->getTextContent( $description[0] ) ) ); // get author + time posted $info = $this->fuckhtml ->getElementsByClassName( $this->getstyle( [ "color" => "var(" . $this->getcolorvar("#70757a") . ")", "font-size" => "14px", "line-height" => "20px", "margin-top" => "12px" ] ), "div" ); if(count($info) !== 0){ $info = explode( "·", $this->fuckhtml ->getTextContent( $info[0] ) ); switch(count($info)){ case 3: $web["table"]["Author"] = trim($info[1]); $web["date"] = strtotime(trim($info[2])); break; case 2: $web["date"] = strtotime(trim($info[1])); break; } } } } } // // get categories of content within the search result // $cats = $this->fuckhtml ->getElementsByAttributeName( "data-sncf", "div" ); foreach($cats as $cat){ $this->fuckhtml->load($cat); // detect image category $images = $this->fuckhtml ->getElementsByTagName( "img" ); if(count($images) !== 0){ foreach($images as $image){ if(isset($image["attributes"]["id"])){ // we found an image if(isset($image["attributes"]["width"])){ $width = (int)$image["attributes"]["width"]; if($width == 110){ $ratio = "1:1"; }elseif($width > 110){ $ratio = "16:9"; }else{ $ratio = "9:16"; } }else{ $ratio = "1:1"; } $web["thumb"] = [ "url" => $this->getdimg($image["attributes"]["id"]), "ratio" => $ratio ]; continue 2; } } } // Detect rating $spans_unfiltered = $this->fuckhtml ->getElementsByTagName( "span" ); $spans = $this->fuckhtml ->getElementsByAttributeName( "aria-label", $spans_unfiltered ); foreach($spans as $span){ if( preg_match( '/^Rated/', $span["attributes"]["aria-label"] ) ){ // found rating // scrape rating preg_match( '/([0-9.]+).*([0-9.]+)/', $span["attributes"]["aria-label"], $rating ); if(isset($rating[1])){ $web["table"]["Rating"] = $rating[1] . "/" . $rating[2]; } $has_seen_reviews = 0; foreach($spans_unfiltered as $span_unfiltered){ if( preg_match( '/([0-9,.]+) +([A-z]+)$/', $this->fuckhtml ->getTextContent( $span_unfiltered ), $votes ) ){ $has_seen_reviews++; $web["table"][ucfirst($votes[2])] = $votes[1]; continue; } $text = $this->fuckhtml ->getTextContent( $span_unfiltered ); if( $text == " " || $text == "" ){ break; } switch($has_seen_reviews){ case 1: // scrape price $web["table"]["Price"] = $text; $has_seen_reviews++; break; case 2: // scrape platform $web["table"]["Platform"] = $text; $has_seen_reviews++; break; case 3: // Scrape type $web["table"]["Medium"] = $text; break; } } continue 2; } } // check if its an answer header $answer_header = $this->fuckhtml ->getElementsByClassName( $this->getstyle( [ "overflow" => "hidden", "text-overflow" => "ellipsis" ] ), "span" ); if(count($answer_header) !== 0){ $link = $this->fuckhtml ->getElementsByTagName( "a" ); $cat["innerHTML"] = str_replace( $link[0]["outerHTML"], "", $cat["innerHTML"] ); continue; } // we probed everything, assume this is the description // if we didn't find one cleanly previously if($web["description"] === null){ $web["description"] = $this->titledots( $this->fuckhtml ->getTextContent( $cat ) ); } } // check if description contains date $description = explode("—", $web["description"], 2); if( count($description) === 2 && strlen($description[0]) <= 20 ){ $date = strtotime($description[0]); if($date !== false){ $web["date"] = $date; $web["description"] = ltrim($description[1]); } } // fetch youtube thumbnail $thumbnail = $this->fuckhtml ->getElementsByClassName( $this->getstyle( [ "border-radius" => "8px", "height" => "fit-content", "justify-content" => "center", "margin-right" => "20px", "margin-top" => "4px", "position" => "relative", "width" => "fit-content" ] ), "div" ); if(count($thumbnail) !== 0){ // load thumbnail container $this->fuckhtml->load($thumbnail[0]); $image = $this->fuckhtml ->getElementsByTagName( "img" ); if( count($image) !== 0 && isset($image[0]["attributes"]["id"]) ){ $web["thumb"] = [ "url" => $this->unshit_thumb( $this->getdimg( $image[0]["attributes"]["id"] ) ), "ratio" => "16:9" ]; } // reset $this->fuckhtml->load($result); } $out["web"][] = $web; } // reset $this->fuckhtml->load($result_div); // // craft $npt token // if( $last_page === false && count($out["web"]) !== 0 ){ if(!isset($params["start"])){ $params["start"] = 20; }else{ $params["start"] += 20; } $out["npt"] = $this->backend ->store( json_encode($params), $pagetype, $proxy ); } return $out; } private function scrape_dimg($html){ // get images loaded through javascript $this->dimg = []; preg_match_all( '/function\(\){google\.ldi=({.*?});/', $html, $dimg ); if(isset($dimg[1])){ foreach($dimg[1] as $i){ $tmp = json_decode($i, true); foreach($tmp as $key => $value){ $this->dimg[$key] = $this->unshit_thumb( $value ); } } } // get additional javascript base64 images preg_match_all( '/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/', $html, $dimg ); if(isset($dimg[1])){ for($i=0; $i<count($dimg[1]); $i++){ $delims = explode(",", $dimg[2][$i]); $string = $this->fuckhtml ->parseJsString( $dimg[1][$i] ); foreach($delims as $delim){ $this->dimg[trim($delim, "'")] = $string; } } } } private function scrape_imagearr($html){ // get image links arrays preg_match_all( '/\[0,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/', $html, $image_arr ); $this->image_arr = []; if(isset($image_arr[1])){ for($i=0; $i<count($image_arr[1]); $i++){ $this->image_arr[$image_arr[1][$i]] = [ [ "url" => $this->fuckhtml ->parseJsString( $image_arr[5][$i] ), "width" => (int)$image_arr[7][$i], "height" => (int)$image_arr[6][$i] ], [ "url" => $this->unshit_thumb( $this->fuckhtml ->parseJsString( $image_arr[2][$i] ) ), "width" => (int)$image_arr[4][$i], "height" => (int)$image_arr[3][$i] ] ]; } } } private function getdimg($dimg){ return isset($this->dimg[$dimg]) ? $this->dimg[$dimg] : null; } private function unshit_thumb($url){ // https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj // https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA $parts = parse_url($url); if( isset($parts["host"]) && preg_match( '/tbn.*\.gstatic\.com/', $parts["host"] ) ){ parse_str($parts["query"], $params); if(isset($params["q"])){ return "https://" . $parts["host"] . "/images?q=" . $params["q"]; } } return $url; } private function parsestyles(){ $styles = []; $style_div = $this->fuckhtml ->getElementsByTagName( "style" ); $raw_styles = ""; foreach($style_div as $style){ $raw_styles .= $style["innerHTML"]; } // filter out media/keyframe queries $raw_styles = preg_replace( '/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/', "", $raw_styles ); // get styles preg_match_all( '/(.+?){([\S\s]*?)}/', $raw_styles, $matches ); for($i=0; $i<count($matches[1]); $i++){ // get style values preg_match_all( '/([^:;]+):([^;]*?(?:\([^)]+\)[^;]*?)?)(?:;|$)/', $matches[2][$i], $values_regex ); $values = []; for($k=0; $k<count($values_regex[1]); $k++){ $values[trim($values_regex[1][$k])] = strtolower(trim($values_regex[2][$k])); } $names = explode(",", $matches[1][$i]); // h1,h2,h3 will each get their own array index foreach($names as $name){ $name = trim($name, "}\t\n\r\0\x0B"); foreach($values as $key => $value){ $styles[$name][$key] = $value; } } } foreach($styles as $key => $values){ $styles[$key]["_c"] = count($values); } $this->styles = $styles; // get CSS colors $this->css_colors = []; if(isset($this->styles[":root"])){ foreach($this->styles[":root"] as $key => $value){ $this->css_colors[$value] = strtolower($key); } } } private function getstyle($styles){ $styles["_c"] = count($styles); foreach($this->styles as $style_key => $style_values){ if(count(array_intersect_assoc($style_values, $styles)) === $styles["_c"] + 1){ $style_key = explode(" ", $style_key); $style_key = $style_key[count($style_key) - 1]; return ltrim( str_replace( [".", "#"], " ", $style_key ) ); } } return false; } private function getcolorvar($color){ if(isset($this->css_colors[$color])){ return $this->css_colors[$color]; } return null; } public function web($get){ if($get["npt"]){ [$get, $proxy] = $this->backend->get($get["npt"], "web"); try{ $html = $this->get( $proxy, "https://www.google.com" . $get, [], true ); }catch(Exception $error){ throw new Exception("Failed to get HTML"); } }else{ $search = $get["s"]; $country = $get["country"]; $nsfw = $get["nsfw"]; $lang = $get["lang"]; $older = $get["older"]; $newer = $get["newer"]; $spellcheck = $get["spellcheck"]; $proxy = $this->backend->get_ip(); $offset = 0; $params = [ "q" => $search, "hl" => "en", "num" => 20 ]; // country if($country != "any"){ $params["gl"] = $country; } // nsfw $params["safe"] = $nsfw == "yes" ? "off" : "active"; // language if($lang != "any"){ $params["lr"] = "lang_" . $lang; } // generate tbs $tbs = []; // get date $older = $older === false ? null : date("m/d/Y", $older); $newer = $newer === false ? null : date("m/d/Y", $newer); if( $older !== null || $newer !== null ){ $tbs["cdr"] = "1"; $tbs["cd_min"] = $newer; $tbs["cd_max"] = $older; } // spellcheck filter if($spellcheck == "no"){ $params["nfpr"] = "1"; } if(count($tbs) !== 0){ $params["tbs"] = ""; foreach($tbs as $key => $value){ $params["tbs"] .= $key . ":" . $value . ","; } $params["tbs"] = rtrim($params["tbs"], ","); } try{ $html = $this->get( $proxy, "https://www.google.com/search", $params, true ); }catch(Exception $error){ throw new Exception("Failed to get HTML"); } //$html = file_get_contents("scraper/google.html"); } $out = [ "status" => "ok", "spelling" => [ "type" => "no_correction", "using" => null, "correction" => null ], "npt" => null, "answer" => [], "web" => [], "image" => [], "video" => [], "news" => [], "related" => [] ]; $this->fuckhtml->load($html); $this->parsestyles(); $boxes = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "border" => "thin solid #dadce0", "padding" => "12px 16px 12px 16px", "margin-bottom" => "10px", "font-family" => "sans-serif" ]), "div" ); $skip_next = false; // get next page token $npt = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "border" => "thin solid #dadce0", "color" => "#70757a", "font-size" => "14px", "text-align" => "center", "table-layout" => "fixed", "width" => "100%" ]), "table" ); if(count($npt) !== 0){ $this->fuckhtml->load($npt[0]); $as = $this->fuckhtml ->getElementsByTagName( "a" ); foreach($as as $a){ $text = $this->fuckhtml ->getTextContent( $a ); if( $text == "Next >" || $text == ">" ){ $out["npt"] = $this->backend->store( $this->fuckhtml ->getTextContent( $a["attributes"]["href"] ), "web", $proxy ); } } $this->fuckhtml->load($html); } $first_box = true; foreach($boxes as $box){ $this->fuckhtml->load($box); if($first_box){ // // Probe for word correction // $first_box = false; $txt = $this->fuckhtml ->getTextContent($box); if( preg_match( '/^Showing results for /', $txt ) ){ $as = $this->fuckhtml ->getElementsByTagName( "a" ); if(count($as) === 2){ $out["spelling"] = [ "type" => "including", "using" => $this->fuckhtml ->getTextContent( $as[0] ), "correction" => $this->fuckhtml ->getTextContent( $as[1] ) ]; } continue; } } // probe for custom container $container_title = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "font-weight" => "bold" ]) ); if(count($container_title) !== 0){ $container_title = strtolower( $this->fuckhtml ->getTextContent( $container_title[0] ) ); if($container_title == "images"){ // // Parse image carousel // $images = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "display" => "inline-block", "padding" => "2px", "padding-bottom" => "4px" ]), "a" ); foreach($images as $image){ $this->fuckhtml->load($image); $image_data = $this->unshiturl( $image["attributes"]["href"], true ); $img = $this->fuckhtml ->getElementsByTagName( "img" )[0]; $out["image"][] = [ "title" => $this->titledots( $this->fuckhtml ->getTextContent( $img["attributes"]["alt"] ) ), "source" => [ [ "url" => $image_data["url"], "width" => $image_data["image_width"], "height" => $image_data["image_height"] ], [ "url" => $this->fuckhtml ->getTextContent( $img["attributes"]["src"] ), "width" => $image_data["thumb_width"], "height" => $image_data["thumb_height"] ] ], "url" => $image_data["ref"] ]; } continue; } if( $container_title == "related searches" || $container_title == "people also search for" ){ $as = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "color" => "#202124", "font-size" => "13px", "line-height" => "20px" ]), "span" ); foreach($as as $a){ $out["related"][] = $this->fuckhtml ->getTextContent( $a ); } continue; } } // probe for website link $link = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "color" => "#1967d2", "font-size" => "18px", "line-height" => "24px" ]), "a" ); if(count($link) !== 0){ // // Parse search result // $this->fuckhtml->load($link[0]); $title = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "color" => "#1967d2", "font-size" => "18px", "line-height" => "24px" ]), "span" ); if(count($title) === 0){ continue; } $this->fuckhtml->load($box); $sublinks = []; $table = []; $categories = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "color" => "#202124", "font-size" => "13px", "line-height" => "20px" ]), "span" ); $i = 0; foreach($categories as $category){ $this->fuckhtml->load($category); // probe for sublinks $subs = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "color" => "#1967d2" ]), "a" ); if(count($subs) !== 0){ foreach($subs as $sub){ $url = $this->unshiturl( $this->fuckhtml ->getTextContent( $sub["attributes"]["href"] ) ); if( preg_match( '/^https?:\/\//', $url ) ){ $sublinks[] = [ "title" => $this->titledots( $this->fuckhtml ->getTextContent( $sub ) ), "description" => null, "url" => $this->unshiturl( $this->fuckhtml ->getTextContent( $sub["attributes"]["href"] ) ), "date" => null ]; } } unset($categories[$i]); } $i++; } // get description & date $date = null; $categories = array_values($categories); //print_r($categories); $c = count($categories) - 1; $description = $this->fuckhtml ->getTextContent( $categories[$c] ); // remove last category since we're done with it unset($categories[$c]); // probe for date $description_tmp = explode("·", $description, 2); $date_tmp = strtotime(trim($description_tmp[0])); if( count($description_tmp) === 2 && strlen($description_tmp[0]) <= 20 && $date_tmp !== false ){ $description = ltrim( $this->titledots( $description_tmp[1] ) ); $date = $date_tmp; }else{ $description = $this->titledots( $description ); } // remaining categories should all be greytext if(count($categories) !== 0){ $texts = explode( "·", preg_replace( '/\s+/', " ", $this->fuckhtml ->getTextContent( $categories[0] ) ) ); foreach($texts as $text){ $text = trim($text); if( preg_match( '/^Rating ([0-9.]+)(?: \(([0-9,]+)\))?/', $text, $rating ) ){ $table["Rating"] = $rating[1]; if(isset($rating[2])){ $table["Rating"] .= " (" . $rating[2] . " votes)"; } continue; } if(stripos($text, "stock") !== false){ $table["Stock"] = $text; continue; } } } $out["web"][] = [ "title" => $this->titledots( $this->fuckhtml ->getTextContent( $title[0] ) ), "description" => $description, "url" => $this->unshiturl( $link[0]["attributes"]["href"] ), "date" => $date, "type" => "web", "thumb" => [ "url" => null, "ratio" => null ], "sublink" => $sublinks, "table" => $table ]; continue; } // parse wikipedia heads $wiki_title = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "color" => "#202124", "font-size" => "18px", "line-height" => "24px" ]), "span" ); if(count($wiki_title) !== 0){ $wiki_title = $this->fuckhtml ->getTextContent( $wiki_title[0] ); if($wiki_title == "See results about"){ // ignore continue; } if($wiki_title == "Top stories"){ // // Parse news // $tds = $this->fuckhtml ->getElementsByTagName( "td" ); foreach($tds as $td){ $this->fuckhtml->load($td); $a = $this->fuckhtml ->getElementsByTagName( "a" ); if(count($a) === 0){ continue; } $title = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "color" => "#1967d2" ]), "span" ); if(count($title) === 0){ continue; } $date = null; $meta_div = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "color" => "#70757a", "font-size" => "13px", "line-height" => "20px" ]), "span" ); $meta_div = explode( "·", $this->fuckhtml ->getTextContent( $meta_div[count($meta_div) - 1] ), 2 ); if(count($meta_div) === 2){ $date = strtotime($meta_div[count($meta_div) - 1]); if($date === false){ $date = null; } } $out["news"][] = [ "title" => $this->titledots( $this->fuckhtml ->getTextContent( $title[0] ) ), "description" => null, "date" => $date, "thumb" => [ "url" => null, "ratio" => null ], "url" => $this->unshiturl( $a[0]["attributes"]["href"] ) ]; } continue; } // // Parse wikipedia heads // $table_div = $this->fuckhtml ->getElementsByTagName( "table" ); if(count($table_div) === 0){ continue; } $this->fuckhtml->load($table_div[0]); // remove table from box $box["innerHTML"] = str_replace( $table_div[0]["outerHTML"], "", $box["innerHTML"] ); // find wiki image $thumb = null; $img = $this->fuckhtml ->getElementsByTagName( "img" ); if(count($img) !== 0){ $thumb = $this->fuckhtml ->getTextContent( $img[0]["attributes"]["src"] ); } $tds = $this->fuckhtml ->getElementsByTagName( "td" ); $description = []; foreach($tds as $td){ // probe for subtitle $this->fuckhtml->load($td); $subtext = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "color" => "#70757a", "font-size" => "13px", "line-height" => "20px" ]) ); if(count($subtext) !== 0){ $description[] = [ "type" => "quote", "value" => $this->fuckhtml ->getTextContent( $subtext[0] ) ]; break; } } $this->fuckhtml->load($box); // probe for word definition $lists = $this->fuckhtml ->getElementsByTagName( "ol" ); if(count($lists) !== 0){ $description = []; foreach($lists as $list){ $box["innerHTML"] = explode( $list["outerHTML"], $box["innerHTML"], 2 ); if( count($box["innerHTML"]) === 1 || trim($box["innerHTML"][0]) == "" ){ break; } $description[] = [ "type" => "title", "value" => $this->fuckhtml ->getTextContent( $box["innerHTML"][0] ) ]; $this->fuckhtml->load($list); $lis = $this->fuckhtml ->getElementsByTagName( "li" ); $increment = 1; foreach($lis as $li){ $this->fuckhtml->load($li); $list_items = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "color" => "#202124", "font-size" => "13px", "line-height" => "20px" ]) ); $first_item = true; foreach($list_items as $it){ if($first_item){ $first_item = false; $c = count($description); if( $c !== 0 && $description[$c - 1]["type"] == "text" ){ $description[$c - 1]["value"] .= "\n\n" . $increment . ". " . $this->fuckhtml ->getTextContent( $it ); }else{ $description[] = [ "type" => "text", "value" => $increment . ". " . $this->fuckhtml ->getTextContent( $it ) ]; } }else{ $description[] = [ "type" => "quote", "value" => $this->fuckhtml ->getTextContent( $it ) ]; } $increment++; } } $box["innerHTML"] = $box["innerHTML"][1]; } $out["answer"][] = [ "title" => $wiki_title, "description" => $description, "url" => null, "thumb" => null, "table" => [], "sublink" => [] ]; continue; } // get separator between description and facts $separator = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "height" => "4px" ]), "div" ); $box_html = []; $table = []; if(count($separator) !== 0){ $box_html = explode( $separator[0]["outerHTML"], $box["innerHTML"], 2 ); if(count($box_html) === 2){ $box["innerHTML"] = $box_html[0]; } $this->fuckhtml->load($box_html[1]); // get all facts $facts = $this->fuckhtml ->getElementsByTagName( "div" ); foreach($facts as $fact){ if($fact["level"] !== 1){ continue; } $fact = explode( ":", $this->fuckhtml ->getTextContent( $fact ) ); $table[trim(preg_replace('/\s+/', " ", $fact[0]))] = trim(preg_replace('/\s+/', " ", $fact[1])); } $this->fuckhtml->load($box); } // remove wikipedia link $wiki_link = $this->fuckhtml ->getElementsByClassName( $this->getstyle([ "color" => "#1967d2" ]), "a" ); $url = null; if(count($wiki_link) !== 0){ foreach($wiki_link as $link){ if( strtolower( $this->fuckhtml ->getTextContent( $link ) ) == "wikipedia" ){ $box["innerHTML"] = str_replace( $link["outerHTML"], "", $box["innerHTML"] ); $url = $this->unshiturl( $link["attributes"]["href"] ); $this->fuckhtml->load($box); break; } } } // remains of box should be description $description[] = [ "type" => "text", "value" => $this->titledots( $this->fuckhtml ->getTextContent( $box ) ) ]; $out["answer"][] = [ "title" => $wiki_title, "description" => $description, "url" => $url, "thumb" => $thumb, "table" => $table, "sublink" => [] ]; } } return $out; } public function video($get){ if($get["npt"]){ [$params, $proxy] = $this->backend->get($get["npt"], "video"); $params = json_decode($params, true); $search = $params["q"]; }else{ $search = $get["s"]; $country = $get["country"]; $nsfw = $get["nsfw"]; $older = $get["older"]; $newer = $get["newer"]; $duration = $get["duration"]; $quality = $get["quality"]; $captions = $get["captions"]; $proxy = $this->backend->get_ip(); $params = [ "q" => $search, "tbm" => "vid", "hl" => "en", "num" => "20" ]; // country if($country != "any"){ $params["gl"] = $country; } // nsfw $params["safe"] = $nsfw == "yes" ? "off" : "active"; $tbs = []; // get date $older = $older === false ? null : date("m/d/Y", $older); $newer = $newer === false ? null : date("m/d/Y", $newer); if( $older !== null || $newer !== null ){ $tbs["cdr"] = "1"; $tbs["cd_min"] = $newer; $tbs["cd_max"] = $older; } // duration if($duration != "any"){ $tbs[] = "dur:" . $duration; } // quality if($quality != "any"){ $tbs[] = "hq:" . $quality; } // captions if($captions != "any"){ $tbs[] = "cc:" . $captions; } // append tbs if(count($tbs) !== 0){ $params["tbs"] = implode(",", $tbs); } } try{ $html = $this->get( $proxy, "https://www.google.com/search", $params ); }catch(Exception $error){ throw new Exception("Failed to get HTML"); } //$html = file_get_contents("scraper/google.html"); $response = $this->parsepage($html, "videos", $search, $proxy, $params); $out = [ "status" => "ok", "npt" => $response["npt"], "video" => [], "author" => [], "livestream" => [], "playlist" => [], "reel" => [] ]; foreach($response["web"] as $result){ $out["video"][] = [ "title" => $result["title"], "description" => $result["description"], "author" => [ "name" => isset($result["table"]["Author"]) ? $result["table"]["Author"] : null, "url" => null, "avatar" => null ], "date" => $result["date"], "duration" => isset($result["table"]["Duration"]) ? $this->hms2int($result["table"]["Duration"]) : null, "views" => null, "thumb" => $result["thumb"], "url" => $result["url"] ]; } return $out; } public function news($get){ if($get["npt"]){ [$req, $proxy] = $this->backend->get($get["npt"], "news"); /*parse_str( parse_url($req, PHP_URL_QUERY), $search );*/ try{ $html = $this->get( $proxy, "https://www.google.com" . $req, [] ); }catch(Exception $error){ throw new Exception("Failed to get HTML"); } }else{ $search = $get["s"]; $country = $get["country"]; $nsfw = $get["nsfw"]; $older = $get["older"]; $newer = $get["newer"]; $sort = $get["sort"]; $proxy = $this->backend->get_ip(); $params = [ "q" => $search, "tbm" => "nws", "hl" => "en", "num" => "20" ]; // country if($country != "any"){ $params["gl"] = $country; } // nsfw $params["safe"] = $nsfw == "yes" ? "off" : "active"; $tbs = []; // get date $older = $older === false ? null : date("m/d/Y", $older); $newer = $newer === false ? null : date("m/d/Y", $newer); if( $older !== null || $newer !== null ){ $tbs["cdr"] = "1"; $tbs["cd_min"] = $newer; $tbs["cd_max"] = $older; } // relevance if($sort == "date"){ $tbs["sbd"] = "1"; } // append tbs if(count($tbs) !== 0){ $params["tbs"] = ""; foreach($tbs as $key => $value){ $params["tbs"] .= $key . ":" . $value . ","; } $params["tbs"] = rtrim($params["tbs"], ","); } //$html = file_get_contents("scraper/google-news.html"); $html = $this->get( $proxy, "https://www.google.com/search", $params ); } $out = [ "status" => "ok", "npt" => null, "news" => [] ]; $this->fuckhtml->load($html); $this->detect_sorry(); // get images $this->scrape_dimg($html); // parse styles $this->parsestyles(); $center_col = $this->fuckhtml ->getElementById( "center_col", "div" ); if($center_col === null){ throw new Exception("Could not grep result div"); } $this->fuckhtml->load($center_col); // get next page $npt = $this->fuckhtml ->getElementById( "pnnext", "a" ); if($npt !== false){ $out["npt"] = $this->backend->store( $this->fuckhtml ->getTextContent( $npt["attributes"] ["href"] ), "news", $proxy ); } $as = $this->fuckhtml ->getElementsByAttributeName( "jsname", "a" ); foreach($as as $a){ $this->fuckhtml->load($a); // get title $title = $this->fuckhtml ->getElementsByAttributeValue( "role", "heading", "div" ); if(count($title) === 0){ continue; } $title = $this->titledots( $this->fuckhtml ->getTextContent( $title[0] ) ); // get thumbnail $image = $this->fuckhtml ->getElementsByAttributeName( "id", "img" ); // check for padded title node, if found, we're inside a carousel $probe = $this->fuckhtml ->getElementsByClassName( $this->getstyle( [ "padding" => "16px 16px 40px 16px" ] ), "div" ); if(count($probe) !== 0){ $probe = true; }else{ $probe = false; } if( count($image) !== 0 && !isset($image[0]["attributes"]["width"]) ){ $thumb = [ "url" => $this->getdimg( $image[0]["attributes"]["id"] ), "ratio" => $probe === true ? "16:9" : "1:1" ]; }else{ $thumb = [ "url" => null, "ratio" => null ]; } $description = null; if($probe === false){ $desc_divs = $this->fuckhtml ->getElementsByAttributeName( "style", "div" ); foreach($desc_divs as $desc){ if( strpos( $desc["attributes"]["style"], "margin-top:" ) !== false ){ $description = $this->titledots( $this->fuckhtml ->getTextContent( $desc ) ); break; } } } // get author $author = $this->fuckhtml ->getElementsByClassName( $this->getstyle( [ "overflow" => "hidden", "text-align" => "left", "text-overflow" => "ellipsis", "white-space" => "nowrap", "margin-bottom" => "8px" ] ), "div" ); if(count($author) !== 0){ $author = $this->fuckhtml ->getTextContent( $author[0] ); }else{ $author = null; } // get date $date = null; $date_div = $this->fuckhtml ->getElementsByAttributeName( "style", "div" ); foreach($date_div as $d){ $this->fuckhtml->load($d); $span = $this->fuckhtml ->getElementsByTagName( "span" ); if( strpos( $d["attributes"]["style"], "bottom:" ) !== false ){ $date = strtotime( $this->fuckhtml ->getTextContent( $span[count($span) - 1] ) ); break; } } $out["news"][] = [ "title" => $title, "author" => $author, "description" => $description, "date" => $date, "thumb" => $thumb, "url" => $this->unshiturl( $a["attributes"] ["href"] ) ]; } return $out; } public function image($get){ // generate parameters if($get["npt"]){ [$params, $proxy] = $this->backend->get( $get["npt"], "images" ); $params = json_decode($params, true); }else{ $search = $get["s"]; if(strlen($search) === 0){ throw new Exception("Search term is empty!"); } $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $time = $get["time"]; $size = $get["size"]; $ratio = $get["ratio"]; $color = $get["color"]; $type = $get["type"]; $format = $get["format"]; $rights = $get["rights"]; $params = [ "q" => $search, "udm" => "2" // get images ]; // country (image search uses cr instead of gl) if($country != "any"){ $params["cr"] = "country" . strtoupper($country); } // nsfw $params["safe"] = $nsfw == "yes" ? "off" : "active"; // generate tbs $tbs = []; // time if($time != "any"){ $tbs["qdr"] = $time; } // size if($size != "any"){ $params["imgsz"] = $size; } // ratio if($ratio != "any"){ $params["imgar"] = $ratio; } // color if($color != "any"){ if( $color == "color" || $color == "trans" ){ $params["imgc"] = $color; }elseif($color == "bnw"){ $params["imgc"] = "gray"; }else{ $tbs["ic"] = "specific"; $tbs["isc"] = $color; } } // type if($type != "any"){ $tbs["itp"] = $type; } // format if($format != "any"){ $params["as_filetype"] = $format; } // rights (tbs) if($rights != "any"){ $tbs["sur"] = $rights; } // append tbs if(count($tbs) !== 0){ $params["tbs"] = ""; foreach($tbs as $key => $value){ $params["tbs"] .= $key . ":" . $value . ","; } $params["tbs"] = rtrim($params["tbs"], ","); } } /* $handle = fopen("scraper/google-img.html", "r"); $html = fread($handle, filesize("scraper/google-img.html")); fclose($handle);*/ try{ $html = $this->get( $proxy, "https://www.google.com/search", $params ); }catch(Exception $error){ throw new Exception("Failed to get search page"); } $this->fuckhtml->load($html); $this->detect_sorry(); // get javascript images $this->scrape_imagearr($html); $out = [ "status" => "ok", "npt" => null, "image" => [] ]; $images = $this->fuckhtml ->getElementsByClassName( "ivg-i", "div" ); foreach($images as $div){ $this->fuckhtml->load($div); $image = $this->fuckhtml ->getElementsByTagName("img")[0]; $out["image"][] = [ "title" => $this->titledots( $this->fuckhtml ->getTextContent( $image["attributes"]["alt"] ) ), "source" => $this->image_arr[ $div["attributes"]["data-docid"] ], "url" => $this->fuckhtml ->getTextContent( $div["attributes"]["data-lpage"] ) ]; } // as usual, no way to check if there is a next page reliably if(count($out["image"]) > 50){ if(!isset($params["start"])){ $params["start"] = 10; }else{ $params["start"] += 10; } $out["npt"] = $this->backend ->store( json_encode($params), "image", $proxy ); } return $out; } private function unshiturl($url, $return_size = false){ // decode $url = $this->fuckhtml ->getTextContent( $url ); $url_parts = parse_url($url); if(isset($url_parts["query"])){ parse_str($url_parts["query"], $query); }else{ $query = []; } if( !isset( $url_parts["host"] ) || stripos($url_parts["host"], "google.") !== false ){ // no host, we have a tracking url if(isset($query["imgurl"])){ $url = $query["imgurl"]; } elseif(isset($query["q"])){ $url = $query["q"]; } } // rewrite URLs to remove extra tracking parameters $domain = parse_url($url, PHP_URL_HOST); if( preg_match( '/wikipedia.org$/', $domain ) ){ // rewrite wikipedia mobile URLs to desktop $url = $this->replacedomain( $url, preg_replace( '/([a-z0-9]+)(\.m\.)/', '$1.', $domain ) ); } elseif( preg_match( '/imdb\.com$|youtube\.[^.]+$/', $domain ) ){ // rewrite imdb and youtube mobile URLs too $url = $this->replacedomain( $url, preg_replace( '/^m\./', "", $domain ) ); } elseif( preg_match( '/play\.google\.[^.]+$/', $domain ) ){ // remove referrers from play.google.com $u_query = parse_url($url, PHP_URL_QUERY); if($u_query !== null){ parse_str($u_query, $u_query); if(isset($u_query["referrer"])){ unset($u_query["referrer"]); } if(isset($u_query["hl"])){ unset($u_query["hl"]); } if(isset($u_query["gl"])){ unset($u_query["gl"]); } $query = http_build_query($query); $url = str_replace( $u_query, $u_query, $url ); } } elseif( preg_match( '/twitter\.com$/', $domain ) ){ // remove more referrers from twitter.com $u_query = parse_url($url, PHP_URL_QUERY); if($u_query !== null){ parse_str($u_query, $u_query); if(isset($u_query["ref_src"])){ unset($u_query["ref_src"]); } $u_query = http_build_query($u_query); $url = str_replace( $oldquery, $u_query, $url ); } } elseif( preg_match( '/maps\.google\.[^.]+/', $domain ) ){ if(stripos($url, "maps?") !== false){ $u_query = parse_url($url, PHP_URL_QUERY); if($u_query !== null){ parse_str($u_query, $u_query); if(isset($u_query["daddr"])){ $url = "https://maps.google.com/maps?daddr=" . urlencode($u_query["daddr"]); } } } } if($return_size){ return [ "url" => $url, "ref" => isset($query["imgrefurl"]) ? $query["imgrefurl"] : null, "thumb_width" => isset($query["tbnw"]) ? (int)$query["tbnw"] : null, "thumb_height" => isset($query["tbnh"]) ? (int)$query["tbnh"] : null, "image_width" => isset($query["w"]) ? (int)$query["w"] : null, "image_height" => isset($query["h"]) ? (int)$query["h"] : null ]; } return $url; } private function replacedomain($url, $domain){ return preg_replace( '/(https?:\/\/)([^\/]+)/', '$1' . $domain, $url ); } private function titledots($title){ return trim($title, " .\t\n\r\0\x0B…"); } private function hms2int($time){ $parts = explode(":", $time, 3); $time = 0; if(count($parts) === 3){ // hours $time = $time + ((int)$parts[0] * 3600); array_shift($parts); } if(count($parts) === 2){ // minutes $time = $time + ((int)$parts[0] * 60); array_shift($parts); } // seconds $time = $time + (int)$parts[0]; return $time; } private function detect_sorry(){ $recaptcha = $this->fuckhtml ->getElementById( "recaptcha", "div" ); if($recaptcha !== false){ throw new Exception("Google returned a captcha"); } } }