From a4a44709b4ee1dffaca8b7f79b3c0814914a58f7 Mon Sep 17 00:00:00 2001 From: lolcat Date: Wed, 8 Oct 2025 00:42:36 -0400 Subject: [PATCH] google quote on quote fix --- data/api_keys/google_api.txt | 8 + lib/backend.php | 37 +- lib/frontend.php | 3 +- scraper/google.php | 461 +--------------------- scraper/google_api.php | 739 +++++++++++++++++++++++++++++++++++ settings.php | 4 + 6 files changed, 789 insertions(+), 463 deletions(-) create mode 100644 data/api_keys/google_api.txt create mode 100644 scraper/google_api.php diff --git a/data/api_keys/google_api.txt b/data/api_keys/google_api.txt new file mode 100644 index 0000000..5548d21 --- /dev/null +++ b/data/api_keys/google_api.txt @@ -0,0 +1,8 @@ +# Specify API keys for the Google API in the following format: +# +# +# Generate keys here: +# https://developers.google.com/custom-search/v1/overview +# Make sure to use a different Google account for each key, cause I'm +# pretty sure the ratelimit is on a per-account basis :P +# diff --git a/lib/backend.php b/lib/backend.php index 66e78a1..68ac270 100644 --- a/lib/backend.php +++ b/lib/backend.php @@ -9,7 +9,7 @@ class backend{ /* Proxy stuff */ - public function get_ip(){ + public function get_ip($proxy_index_raw = null){ $pool = constant("config::PROXY_" . strtoupper($this->scraper)); if($pool === false){ @@ -19,7 +19,10 @@ class backend{ } // indent - $proxy_index_raw = apcu_inc("p." . $this->scraper); + if($proxy_index_raw === null){ + + $proxy_index_raw = apcu_inc("p." . $this->scraper); + } $proxylist = file_get_contents("data/proxies/" . $pool . ".txt"); $proxylist = explode("\n", $proxylist); @@ -32,6 +35,12 @@ class backend{ $proxylist = array_values($proxylist); + if(count($proxylist) === 0){ + + throw new Exception("A proxy list was specified but it's empty!"); + } + + //echo $proxylist[$proxy_index_raw % count($proxylist)]; return $proxylist[$proxy_index_raw % count($proxylist)]; } @@ -88,6 +97,30 @@ class backend{ } } + // API key rotation + public function get_key(){ + + $keys = file_get_contents("data/api_keys/" . $this->scraper . ".txt"); + $keys = explode("\n", $keys); + + $keys = array_filter($keys, function($entry){ + $entry = ltrim($entry); + return strlen($entry) > 0 && substr($entry, 0, 1) != "#"; + }); + + $keys = array_values($keys); + + if(count($keys) === 0){ + + throw new Exception("Please specify API keys in data/api_keys/" . $this->scraper . ".txt"); + } + + $increment = apcu_inc("s." . $this->scraper) % count($keys); + return [ + "key" => $keys[$increment], + "increment" => $increment + ]; + } /* diff --git a/lib/frontend.php b/lib/frontend.php index dfa8b0b..eb87df7 100644 --- a/lib/frontend.php +++ b/lib/frontend.php @@ -937,11 +937,12 @@ class frontend{ "display" => "Scraper", "option" => [ "ddg" => "DuckDuckGo", + "yahoo" => "Yahoo!", "brave" => "Brave", "mullvad_brave" => "Mullvad (Brave)", "yandex" => "Yandex", "google" => "Google", - //"google_api" => "Google API", + "google_api" => "Google API", "google_cse" => "Google CSE", "mullvad_google" => "Mullvad (Google)", "startpage" => "Startpage", diff --git a/scraper/google.php b/scraper/google.php index c83b084..049c844 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -561,466 +561,7 @@ class google{ public function web($get){ - if($get["npt"]){ - - [$params, $proxy] = $this->backend->get($get["npt"], "web"); - - $params = json_decode($params, true); - - try{ - $json = - $this->get( - $proxy, - "https://www.google.com/search", - $params - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - }else{ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $lang = $get["lang"]; - $older = $get["older"]; - $newer = $get["newer"]; - $spellcheck = $get["spellcheck"]; - $proxy = $this->backend->get_ip(); - - $offset = 0; - - /* - https://www.google.com/search?udm=14&yv=3&q=asmr&biw=1920&bih=947&start=0&sa=N&asearch=arc&cs=1&async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc - - https://www.google.com/search?udm=14& - yv=3& - q=asmr& - biw=1920& - bih=947& - start=0& - sa=N& - asearch=arc& - cs=1& - async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc - */ - - $params = [ - "udm" => 14, - "yv" => 3, - "q" => $search, - "biw" => 1920, - "bih" => 947, - "start" => 0, - "sa" => "N", - "asearch" => "arc", - "cs" => 1, - "async" => "arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc", - "hl" => "en", - "num" => 20 - ]; - - // country - if($country != "any"){ - - $params["gl"] = $country; - } - - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; - - // language - if($lang != "any"){ - - $params["lr"] = "lang_" . $lang; - } - - // generate tbs - $tbs = []; - - // get date - $older = $older === false ? null : date("m/d/Y", $older); - $newer = $newer === false ? null : date("m/d/Y", $newer); - - if( - $older !== null || - $newer !== null - ){ - - $tbs["cdr"] = "1"; - $tbs["cd_min"] = $newer; - $tbs["cd_max"] = $older; - } - - // spellcheck filter - if($spellcheck == "no"){ - - $params["nfpr"] = "1"; - } - - if(count($tbs) !== 0){ - - $params["tbs"] = ""; - - foreach($tbs as $key => $value){ - - $params["tbs"] .= $key . ":" . $value . ","; - } - - $params["tbs"] = rtrim($params["tbs"], ","); - } - - try{ - $json = - $this->get( - $proxy, - "https://www.google.com/search", - $params - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - //$json = file_get_contents("scraper/google.js"); - } - - $out = [ - "status" => "ok", - "spelling" => [ - "type" => "no_correction", - "using" => null, - "correction" => null - ], - "npt" => null, - "answer" => [], - "web" => [], - "image" => [], - "video" => [], - "news" => [], - "related" => [] - ]; - - $this->fuckhtml->load($json); - $this->detect_sorry(); - - // get next page - /* - $npt = - $this->fuckhtml - ->getElementsByAttributeName( - "data-state-token", - "div" - ); - - if(count($npt) !== 0){ - - $params["sstk"] = - $this->fuckhtml - ->getTextContent( - $npt[0]["attributes"]["data-state-token"] - ); - - $params["start"] += 10; - - $out["npt"] = - $this->backend->store( - json_encode($params), - "web", - $proxy - ); - }*/ - - // get invididual results - $results = - $this->fuckhtml - ->getElementsByAttributeName( - "data-hveid", - "div" - ); - - foreach($results as $result){ - - $this->fuckhtml->load($result); - - //echo $result["innerHTML"]; - - $snfs = - $this->fuckhtml - ->getElementsByAttributeName( - "data-snf", - "div" - ); - - $title = null; - $description = null; - $link = null; - $sublinks = []; - $date = null; - $thumb = [ - "ratio" => null, - "url" => null - ]; - $table = []; - - // probe for title - $title_node = - $this->fuckhtml - ->getElementsByTagName( - "h3" - ); - - if(count($title_node) !== 0){ - - // found a title node - $title = - $this->fuckhtml - ->getTextContent( - $title_node[0] - ); - } - - if($title === null){ - - // should not happen - continue; - } - - foreach($snfs as $snf){ - - $this->fuckhtml->load($snf); - - // probe for thumbnail - $thumbnail = - $this->fuckhtml - ->getElementsByAttributeName( - "alt", - "img" - ); - - foreach($thumbnail as $t){ - - if( - isset($t["attributes"]["style"]) && - preg_match( - '/height ?: ?([0-9]+)px/', - $t["attributes"]["style"], - $match - ) && - (int)$match[1] < 40 - ){ - - // found a favicon, ignore - continue; - } - - $thumb = [ - "ratio" => "1:1", - "url" => - $this->fuckhtml - ->getTextContent( - $thumbnail[0]["attributes"]["src"] - ) - ]; - - continue 2; - } - - // probe for description - if($description === null){ - - // probe 1 - if( - isset($snf["attributes"]["data-sncf"]) && - $snf["attributes"]["data-sncf"] == "1,2" - ){ - - $description = - $this->fuckhtml - ->getTextContent( - $snf - ); - continue; - } - - // probe 2 - $desc_probe = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "-webkit-line-clamp:2", - "div" - ); - - if(count($desc_probe) !== 0){ - - $description = - $this->fuckhtml - ->getTextContent( - $desc_probe[0] - ); - continue; - } - } - - // probe for links - $links = - $this->fuckhtml - ->getElementsByAttributeName( - "data-sb", - "a" - ); - - if(isset($links[0]["attributes"]["data-ved"])){ - - // found the page link - $link = - $this->fuckhtml - ->getTextContent( - $links[0]["attributes"]["href"] - ); - - continue; - } - - if(count($links) !== 0){ - - // get all sublinks - for($i=0; $i - $this->titledots( - $this->fuckhtml - ->getTextContent( - $links[$i] - ) - ), - "description" => null, - "date" => null, - "url" => - $this->fuckhtml - ->getTextContent( - $links[$i]["attributes"]["href"] - ) - ]; - } - - continue; - } - - // get tabloid-able data - $tabloid = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "margin-top:0px", - "div" - ); - - if(count($tabloid) === 0){ - - // try getting instead - $tabloid = - $this->fuckhtml - ->getElementsByTagName( - "cite" - ); - } - - if(count($tabloid) !== 0){ - - // found table - $tabloid = - explode("·", $tabloid[0]["innerHTML"]); - - foreach($tabloid as $tbl){ - - $preg = - $this->fuckhtml - ->getTextContent( - $tbl - ); - - //$table[random_int(0,1000)] = $preg; - - if( - // match price - preg_match( - '/(\p{Sc}[^\p{Sc}]+)/', - $preg, - $match - ) - ){ - - $table["Price"] = trim($match[1]); - } - - if( - // match in stock/delivery - preg_match( - '/(stock|delivery|returns)/i', - $preg, - $match - ) - ){ - - $table[ucfirst($match[1])] = trim($preg, " \t\n\r\0\x0B\xC2\xA0"); - } - } - continue; - } - } - - // extract date from description - $description_split = - explode( - "—", $description, 2 - ); - - if(count($description_split) === 1){ - - $description = $description_split[0]; - }elseif(strlen($description_split[0]) < 17){ - - $date = strtotime($description_split[0]); - - if($date !== false){ - - $description = $description_split[1]; - }else{ - - $date = null; - } - } - - $out["web"][] = [ - "title" => $this->titledots($title), - "description" => $this->titledots($description), - "url" => $link, - "date" => $date, - "type" => "web", - "thumb" => $thumb, - "sublink" => $sublinks, - "table" => $table - ]; - } - - // get next page - if(count($out["web"]) > 5){ - - $params["start"] += 10; - - $out["npt"] = - $this->backend->store( - json_encode($params), - "web", - $proxy - ); - } - - return $out; + throw new Exception("Google made it impossible to scrape web results without a JavaScript runtime. In the meantime, use the Google API or the Google CSE scrapers."); } diff --git a/scraper/google_api.php b/scraper/google_api.php new file mode 100644 index 0000000..899726b --- /dev/null +++ b/scraper/google_api.php @@ -0,0 +1,739 @@ +backend = new backend("google_api"); + } + + public function getfilters($page){ + + $base = [ + "country" => [ // gl= (image: cr=countryAF) + "display" => "Country", + "option" => [ + "any" => "Instance's country", + "af" => "Afghanistan", + "al" => "Albania", + "dz" => "Algeria", + "as" => "American Samoa", + "ad" => "Andorra", + "ao" => "Angola", + "ai" => "Anguilla", + "aq" => "Antarctica", + "ag" => "Antigua and Barbuda", + "ar" => "Argentina", + "am" => "Armenia", + "aw" => "Aruba", + "au" => "Australia", + "at" => "Austria", + "az" => "Azerbaijan", + "bs" => "Bahamas", + "bh" => "Bahrain", + "bd" => "Bangladesh", + "bb" => "Barbados", + "by" => "Belarus", + "be" => "Belgium", + "bz" => "Belize", + "bj" => "Benin", + "bm" => "Bermuda", + "bt" => "Bhutan", + "bo" => "Bolivia", + "ba" => "Bosnia and Herzegovina", + "bw" => "Botswana", + "bv" => "Bouvet Island", + "br" => "Brazil", + "io" => "British Indian Ocean Territory", + "bn" => "Brunei Darussalam", + "bg" => "Bulgaria", + "bf" => "Burkina Faso", + "bi" => "Burundi", + "kh" => "Cambodia", + "cm" => "Cameroon", + "ca" => "Canada", + "cv" => "Cape Verde", + "ky" => "Cayman Islands", + "cf" => "Central African Republic", + "td" => "Chad", + "cl" => "Chile", + "cn" => "China", + "cx" => "Christmas Island", + "cc" => "Cocos (Keeling) Islands", + "co" => "Colombia", + "km" => "Comoros", + "cg" => "Congo", + "cd" => "Congo, the Democratic Republic", + "ck" => "Cook Islands", + "cr" => "Costa Rica", + "ci" => "Cote D'ivoire", + "hr" => "Croatia", + "cu" => "Cuba", + "cy" => "Cyprus", + "cz" => "Czech Republic", + "dk" => "Denmark", + "dj" => "Djibouti", + "dm" => "Dominica", + "do" => "Dominican Republic", + "ec" => "Ecuador", + "eg" => "Egypt", + "sv" => "El Salvador", + "gq" => "Equatorial Guinea", + "er" => "Eritrea", + "ee" => "Estonia", + "et" => "Ethiopia", + "fk" => "Falkland Islands (Malvinas)", + "fo" => "Faroe Islands", + "fj" => "Fiji", + "fi" => "Finland", + "fr" => "France", + "gf" => "French Guiana", + "pf" => "French Polynesia", + "tf" => "French Southern Territories", + "ga" => "Gabon", + "gm" => "Gambia", + "ge" => "Georgia", + "de" => "Germany", + "gh" => "Ghana", + "gi" => "Gibraltar", + "gr" => "Greece", + "gl" => "Greenland", + "gd" => "Grenada", + "gp" => "Guadeloupe", + "gu" => "Guam", + "gt" => "Guatemala", + "gn" => "Guinea", + "gw" => "Guinea-Bissau", + "gy" => "Guyana", + "ht" => "Haiti", + "hm" => "Heard Island and Mcdonald Islands", + "va" => "Holy See (Vatican City State)", + "hn" => "Honduras", + "hk" => "Hong Kong", + "hu" => "Hungary", + "is" => "Iceland", + "in" => "India", + "id" => "Indonesia", + "ir" => "Iran, Islamic Republic", + "iq" => "Iraq", + "ie" => "Ireland", + "il" => "Israel", + "it" => "Italy", + "jm" => "Jamaica", + "jp" => "Japan", + "jo" => "Jordan", + "kz" => "Kazakhstan", + "ke" => "Kenya", + "ki" => "Kiribati", + "kp" => "Korea, Democratic People's Republic", + "kr" => "Korea, Republic", + "kw" => "Kuwait", + "kg" => "Kyrgyzstan", + "la" => "Lao People's Democratic Republic", + "lv" => "Latvia", + "lb" => "Lebanon", + "ls" => "Lesotho", + "lr" => "Liberia", + "ly" => "Libyan Arab Jamahiriya", + "li" => "Liechtenstein", + "lt" => "Lithuania", + "lu" => "Luxembourg", + "mo" => "Macao", + "mk" => "Macedonia, the Former Yugosalv Republic", + "mg" => "Madagascar", + "mw" => "Malawi", + "my" => "Malaysia", + "mv" => "Maldives", + "ml" => "Mali", + "mt" => "Malta", + "mh" => "Marshall Islands", + "mq" => "Martinique", + "mr" => "Mauritania", + "mu" => "Mauritius", + "yt" => "Mayotte", + "mx" => "Mexico", + "fm" => "Micronesia, Federated States", + "md" => "Moldova, Republic", + "mc" => "Monaco", + "mn" => "Mongolia", + "ms" => "Montserrat", + "ma" => "Morocco", + "mz" => "Mozambique", + "mm" => "Myanmar", + "na" => "Namibia", + "nr" => "Nauru", + "np" => "Nepal", + "nl" => "Netherlands", + "an" => "Netherlands Antilles", + "nc" => "New Caledonia", + "nz" => "New Zealand", + "ni" => "Nicaragua", + "ne" => "Niger", + "ng" => "Nigeria", + "nu" => "Niue", + "nf" => "Norfolk Island", + "mp" => "Northern Mariana Islands", + "no" => "Norway", + "om" => "Oman", + "pk" => "Pakistan", + "pw" => "Palau", + "ps" => "Palestinian Territory, Occupied", + "pa" => "Panama", + "pg" => "Papua New Guinea", + "py" => "Paraguay", + "pe" => "Peru", + "ph" => "Philippines", + "pn" => "Pitcairn", + "pl" => "Poland", + "pt" => "Portugal", + "pr" => "Puerto Rico", + "qa" => "Qatar", + "re" => "Reunion", + "ro" => "Romania", + "ru" => "Russian Federation", + "rw" => "Rwanda", + "sh" => "Saint Helena", + "kn" => "Saint Kitts and Nevis", + "lc" => "Saint Lucia", + "pm" => "Saint Pierre and Miquelon", + "vc" => "Saint Vincent and the Grenadines", + "ws" => "Samoa", + "sm" => "San Marino", + "st" => "Sao Tome and Principe", + "sa" => "Saudi Arabia", + "sn" => "Senegal", + "cs" => "Serbia and Montenegro", + "sc" => "Seychelles", + "sl" => "Sierra Leone", + "sg" => "Singapore", + "sk" => "Slovakia", + "si" => "Slovenia", + "sb" => "Solomon Islands", + "so" => "Somalia", + "za" => "South Africa", + "gs" => "South Georgia and the South Sandwich Islands", + "es" => "Spain", + "lk" => "Sri Lanka", + "sd" => "Sudan", + "sr" => "Suriname", + "sj" => "Svalbard and Jan Mayen", + "sz" => "Swaziland", + "se" => "Sweden", + "ch" => "Switzerland", + "sy" => "Syrian Arab Republic", + "tw" => "Taiwan, Province of China", + "tj" => "Tajikistan", + "tz" => "Tanzania, United Republic", + "th" => "Thailand", + "tl" => "Timor-Leste", + "tg" => "Togo", + "tk" => "Tokelau", + "to" => "Tonga", + "tt" => "Trinidad and Tobago", + "tn" => "Tunisia", + "tr" => "Turkey", + "tm" => "Turkmenistan", + "tc" => "Turks and Caicos Islands", + "tv" => "Tuvalu", + "ug" => "Uganda", + "ua" => "Ukraine", + "ae" => "United Arab Emirates", + "uk" => "United Kingdom", + "us" => "United States", + "um" => "United States Minor Outlying Islands", + "uy" => "Uruguay", + "uz" => "Uzbekistan", + "vu" => "Vanuatu", + "ve" => "Venezuela", + "vn" => "Viet Nam", + "vg" => "Virgin Islands, British", + "vi" => "Virgin Islands, U.S.", + "wf" => "Wallis and Futuna", + "eh" => "Western Sahara", + "ye" => "Yemen", + "zm" => "Zambia", + "zw" => "Zimbabwe" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // safe=active + "no" => "No" // safe=off + ] + ] + ]; + + switch($page){ + + case "web": + return array_merge( + $base, + [ + "lang" => [ // lr= (prefix lang with "lang_") + "display" => "Language", + "option" => [ + "any" => "Any language", + "ar" => "Arabic", + "bg" => "Bulgarian", + "ca" => "Catalan", + "cs" => "Czech", + "da" => "Danish", + "de" => "German", + "el" => "Greek", + "en" => "English", + "es" => "Spanish", + "et" => "Estonian", + "fi" => "Finnish", + "fr" => "French", + "hr" => "Croatian", + "hu" => "Hungarian", + "id" => "Indonesian", + "is" => "Icelandic", + "it" => "Italian", + "iw" => "Hebrew", + "ja" => "Japanese", + "ko" => "Korean", + "lt" => "Lithuanian", + "lv" => "Latvian", + "nl" => "Dutch", + "no" => "Norwegian", + "pl" => "Polish", + "pt" => "Portuguese", + "ro" => "Romanian", + "ru" => "Russian", + "sk" => "Slovak", + "sl" => "Slovenian", + "sr" => "Serbian", + "sv" => "Swedish", + "tr" => "Turkish", + "zh-CN" => "Chinese (Simplified)", + "zh-TW" => "Chinese (Traditional)" + ] + ], + "sort" => [ + "display" => "Sort by", + "option" => [ + "any" => "Any order", + "date:d" => "Oldest", + "date:a" => "Newest" + ] + ], + "newer" => [ + "display" => "Newer than", + "option" => "_DATE" + ], + "rm_dupes" => [ + "display" => "Remove duplicates", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ] + ] + ); + break; + /* + case "images": + return array_merge( + $base, + [ + "time" => [ // tbs=qdr: