google quote on quote fix
This commit is contained in:
8
data/api_keys/google_api.txt
Normal file
8
data/api_keys/google_api.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
# Specify API keys for the Google API in the following format:
|
||||
# <key>
|
||||
#
|
||||
# Generate keys here:
|
||||
# https://developers.google.com/custom-search/v1/overview
|
||||
# Make sure to use a different Google account for each key, cause I'm
|
||||
# pretty sure the ratelimit is on a per-account basis :P
|
||||
#
|
@@ -9,7 +9,7 @@ class backend{
|
||||
/*
|
||||
Proxy stuff
|
||||
*/
|
||||
public function get_ip(){
|
||||
public function get_ip($proxy_index_raw = null){
|
||||
|
||||
$pool = constant("config::PROXY_" . strtoupper($this->scraper));
|
||||
if($pool === false){
|
||||
@@ -19,7 +19,10 @@ class backend{
|
||||
}
|
||||
|
||||
// indent
|
||||
if($proxy_index_raw === null){
|
||||
|
||||
$proxy_index_raw = apcu_inc("p." . $this->scraper);
|
||||
}
|
||||
|
||||
$proxylist = file_get_contents("data/proxies/" . $pool . ".txt");
|
||||
$proxylist = explode("\n", $proxylist);
|
||||
@@ -32,6 +35,12 @@ class backend{
|
||||
|
||||
$proxylist = array_values($proxylist);
|
||||
|
||||
if(count($proxylist) === 0){
|
||||
|
||||
throw new Exception("A proxy list was specified but it's empty!");
|
||||
}
|
||||
|
||||
//echo $proxylist[$proxy_index_raw % count($proxylist)];
|
||||
return $proxylist[$proxy_index_raw % count($proxylist)];
|
||||
}
|
||||
|
||||
@@ -88,6 +97,30 @@ class backend{
|
||||
}
|
||||
}
|
||||
|
||||
// API key rotation
|
||||
/**
 * Pick the next API key for this scraper, round-robin.
 *
 * Reads data/api_keys/<scraper>.txt, one key per line. Blank lines and
 * lines whose first non-blank character is "#" are skipped. The key is
 * selected with the per-scraper APCu counter "s.<scraper>", so successive
 * calls walk through the list.
 *
 * @return array ["key" => string, "increment" => int], where "increment"
 *               is the zero-based index of the returned key
 * @throws Exception when the key file is unreadable or holds no keys
 */
public function get_key(){

	$path = "data/api_keys/" . $this->scraper . ".txt";

	$raw = file_get_contents($path);

	$keys = [];

	if($raw !== false){

		foreach(explode("\n", $raw) as $line){

			// trim BOTH sides: leading blanks, trailing blanks and the "\r"
			// left behind by CRLF files must not become part of the key
			$line = trim($line);

			if(
				$line === "" ||
				$line[0] === "#"
			){

				continue;
			}

			$keys[] = $line;
		}
	}

	if(count($keys) === 0){

		throw new Exception("Please specify API keys in " . $path);
	}

	// apcu_inc() returns false on failure; the cast turns that into
	// index 0 instead of relying on implicit bool arithmetic
	$increment = (int)apcu_inc("s." . $this->scraper) % count($keys);

	return [
		"key" => $keys[$increment],
		"increment" => $increment
	];
}
|
||||
|
||||
|
||||
/*
|
||||
|
@@ -937,11 +937,12 @@ class frontend{
|
||||
"display" => "Scraper",
|
||||
"option" => [
|
||||
"ddg" => "DuckDuckGo",
|
||||
"yahoo" => "Yahoo!",
|
||||
"brave" => "Brave",
|
||||
"mullvad_brave" => "Mullvad (Brave)",
|
||||
"yandex" => "Yandex",
|
||||
"google" => "Google",
|
||||
//"google_api" => "Google API",
|
||||
"google_api" => "Google API",
|
||||
"google_cse" => "Google CSE",
|
||||
"mullvad_google" => "Mullvad (Google)",
|
||||
"startpage" => "Startpage",
|
||||
|
@@ -561,466 +561,7 @@ class google{
|
||||
|
||||
public function web($get){
|
||||
|
||||
if($get["npt"]){
|
||||
|
||||
[$params, $proxy] = $this->backend->get($get["npt"], "web");
|
||||
|
||||
$params = json_decode($params, true);
|
||||
|
||||
try{
|
||||
$json =
|
||||
$this->get(
|
||||
$proxy,
|
||||
"https://www.google.com/search",
|
||||
$params
|
||||
);
|
||||
}catch(Exception $error){
|
||||
|
||||
throw new Exception("Failed to get HTML");
|
||||
}
|
||||
|
||||
}else{
|
||||
$search = $get["s"];
|
||||
$country = $get["country"];
|
||||
$nsfw = $get["nsfw"];
|
||||
$lang = $get["lang"];
|
||||
$older = $get["older"];
|
||||
$newer = $get["newer"];
|
||||
$spellcheck = $get["spellcheck"];
|
||||
$proxy = $this->backend->get_ip();
|
||||
|
||||
$offset = 0;
|
||||
|
||||
/*
|
||||
https://www.google.com/search?udm=14&yv=3&q=asmr&biw=1920&bih=947&start=0&sa=N&asearch=arc&cs=1&async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc
|
||||
|
||||
https://www.google.com/search?udm=14&
|
||||
yv=3&
|
||||
q=asmr&
|
||||
biw=1920&
|
||||
bih=947&
|
||||
start=0&
|
||||
sa=N&
|
||||
asearch=arc&
|
||||
cs=1&
|
||||
async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc
|
||||
*/
|
||||
|
||||
$params = [
|
||||
"udm" => 14,
|
||||
"yv" => 3,
|
||||
"q" => $search,
|
||||
"biw" => 1920,
|
||||
"bih" => 947,
|
||||
"start" => 0,
|
||||
"sa" => "N",
|
||||
"asearch" => "arc",
|
||||
"cs" => 1,
|
||||
"async" => "arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc",
|
||||
"hl" => "en",
|
||||
"num" => 20
|
||||
];
|
||||
|
||||
// country
|
||||
if($country != "any"){
|
||||
|
||||
$params["gl"] = $country;
|
||||
}
|
||||
|
||||
// nsfw
|
||||
$params["safe"] = $nsfw == "yes" ? "off" : "active";
|
||||
|
||||
// language
|
||||
if($lang != "any"){
|
||||
|
||||
$params["lr"] = "lang_" . $lang;
|
||||
}
|
||||
|
||||
// generate tbs
|
||||
$tbs = [];
|
||||
|
||||
// get date
|
||||
$older = $older === false ? null : date("m/d/Y", $older);
|
||||
$newer = $newer === false ? null : date("m/d/Y", $newer);
|
||||
|
||||
if(
|
||||
$older !== null ||
|
||||
$newer !== null
|
||||
){
|
||||
|
||||
$tbs["cdr"] = "1";
|
||||
$tbs["cd_min"] = $newer;
|
||||
$tbs["cd_max"] = $older;
|
||||
}
|
||||
|
||||
// spellcheck filter
|
||||
if($spellcheck == "no"){
|
||||
|
||||
$params["nfpr"] = "1";
|
||||
}
|
||||
|
||||
if(count($tbs) !== 0){
|
||||
|
||||
$params["tbs"] = "";
|
||||
|
||||
foreach($tbs as $key => $value){
|
||||
|
||||
$params["tbs"] .= $key . ":" . $value . ",";
|
||||
}
|
||||
|
||||
$params["tbs"] = rtrim($params["tbs"], ",");
|
||||
}
|
||||
|
||||
try{
|
||||
$json =
|
||||
$this->get(
|
||||
$proxy,
|
||||
"https://www.google.com/search",
|
||||
$params
|
||||
);
|
||||
}catch(Exception $error){
|
||||
|
||||
throw new Exception("Failed to get HTML");
|
||||
}
|
||||
|
||||
//$json = file_get_contents("scraper/google.js");
|
||||
}
|
||||
|
||||
$out = [
|
||||
"status" => "ok",
|
||||
"spelling" => [
|
||||
"type" => "no_correction",
|
||||
"using" => null,
|
||||
"correction" => null
|
||||
],
|
||||
"npt" => null,
|
||||
"answer" => [],
|
||||
"web" => [],
|
||||
"image" => [],
|
||||
"video" => [],
|
||||
"news" => [],
|
||||
"related" => []
|
||||
];
|
||||
|
||||
$this->fuckhtml->load($json);
|
||||
$this->detect_sorry();
|
||||
|
||||
// get next page
|
||||
/*
|
||||
$npt =
|
||||
$this->fuckhtml
|
||||
->getElementsByAttributeName(
|
||||
"data-state-token",
|
||||
"div"
|
||||
);
|
||||
|
||||
if(count($npt) !== 0){
|
||||
|
||||
$params["sstk"] =
|
||||
$this->fuckhtml
|
||||
->getTextContent(
|
||||
$npt[0]["attributes"]["data-state-token"]
|
||||
);
|
||||
|
||||
$params["start"] += 10;
|
||||
|
||||
$out["npt"] =
|
||||
$this->backend->store(
|
||||
json_encode($params),
|
||||
"web",
|
||||
$proxy
|
||||
);
|
||||
}*/
|
||||
|
||||
// get invididual results
|
||||
$results =
|
||||
$this->fuckhtml
|
||||
->getElementsByAttributeName(
|
||||
"data-hveid",
|
||||
"div"
|
||||
);
|
||||
|
||||
foreach($results as $result){
|
||||
|
||||
$this->fuckhtml->load($result);
|
||||
|
||||
//echo $result["innerHTML"];
|
||||
|
||||
$snfs =
|
||||
$this->fuckhtml
|
||||
->getElementsByAttributeName(
|
||||
"data-snf",
|
||||
"div"
|
||||
);
|
||||
|
||||
$title = null;
|
||||
$description = null;
|
||||
$link = null;
|
||||
$sublinks = [];
|
||||
$date = null;
|
||||
$thumb = [
|
||||
"ratio" => null,
|
||||
"url" => null
|
||||
];
|
||||
$table = [];
|
||||
|
||||
// probe for title
|
||||
$title_node =
|
||||
$this->fuckhtml
|
||||
->getElementsByTagName(
|
||||
"h3"
|
||||
);
|
||||
|
||||
if(count($title_node) !== 0){
|
||||
|
||||
// found a title node
|
||||
$title =
|
||||
$this->fuckhtml
|
||||
->getTextContent(
|
||||
$title_node[0]
|
||||
);
|
||||
}
|
||||
|
||||
if($title === null){
|
||||
|
||||
// should not happen
|
||||
continue;
|
||||
}
|
||||
|
||||
foreach($snfs as $snf){
|
||||
|
||||
$this->fuckhtml->load($snf);
|
||||
|
||||
// probe for thumbnail
|
||||
$thumbnail =
|
||||
$this->fuckhtml
|
||||
->getElementsByAttributeName(
|
||||
"alt",
|
||||
"img"
|
||||
);
|
||||
|
||||
foreach($thumbnail as $t){
|
||||
|
||||
if(
|
||||
isset($t["attributes"]["style"]) &&
|
||||
preg_match(
|
||||
'/height ?: ?([0-9]+)px/',
|
||||
$t["attributes"]["style"],
|
||||
$match
|
||||
) &&
|
||||
(int)$match[1] < 40
|
||||
){
|
||||
|
||||
// found a favicon, ignore
|
||||
continue;
|
||||
}
|
||||
|
||||
$thumb = [
|
||||
"ratio" => "1:1",
|
||||
"url" =>
|
||||
$this->fuckhtml
|
||||
->getTextContent(
|
||||
$thumbnail[0]["attributes"]["src"]
|
||||
)
|
||||
];
|
||||
|
||||
continue 2;
|
||||
}
|
||||
|
||||
// probe for description
|
||||
if($description === null){
|
||||
|
||||
// probe 1
|
||||
if(
|
||||
isset($snf["attributes"]["data-sncf"]) &&
|
||||
$snf["attributes"]["data-sncf"] == "1,2"
|
||||
){
|
||||
|
||||
$description =
|
||||
$this->fuckhtml
|
||||
->getTextContent(
|
||||
$snf
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
// probe 2
|
||||
$desc_probe =
|
||||
$this->fuckhtml
|
||||
->getElementsByAttributeValue(
|
||||
"style",
|
||||
"-webkit-line-clamp:2",
|
||||
"div"
|
||||
);
|
||||
|
||||
if(count($desc_probe) !== 0){
|
||||
|
||||
$description =
|
||||
$this->fuckhtml
|
||||
->getTextContent(
|
||||
$desc_probe[0]
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// probe for links
|
||||
$links =
|
||||
$this->fuckhtml
|
||||
->getElementsByAttributeName(
|
||||
"data-sb",
|
||||
"a"
|
||||
);
|
||||
|
||||
if(isset($links[0]["attributes"]["data-ved"])){
|
||||
|
||||
// found the page link
|
||||
$link =
|
||||
$this->fuckhtml
|
||||
->getTextContent(
|
||||
$links[0]["attributes"]["href"]
|
||||
);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if(count($links) !== 0){
|
||||
|
||||
// get all sublinks
|
||||
for($i=0; $i<count($links); $i++){
|
||||
|
||||
$sublinks[] = [
|
||||
"title" =>
|
||||
$this->titledots(
|
||||
$this->fuckhtml
|
||||
->getTextContent(
|
||||
$links[$i]
|
||||
)
|
||||
),
|
||||
"description" => null,
|
||||
"date" => null,
|
||||
"url" =>
|
||||
$this->fuckhtml
|
||||
->getTextContent(
|
||||
$links[$i]["attributes"]["href"]
|
||||
)
|
||||
];
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// get tabloid-able data
|
||||
$tabloid =
|
||||
$this->fuckhtml
|
||||
->getElementsByAttributeValue(
|
||||
"style",
|
||||
"margin-top:0px",
|
||||
"div"
|
||||
);
|
||||
|
||||
if(count($tabloid) === 0){
|
||||
|
||||
// try getting <cite> instead
|
||||
$tabloid =
|
||||
$this->fuckhtml
|
||||
->getElementsByTagName(
|
||||
"cite"
|
||||
);
|
||||
}
|
||||
|
||||
if(count($tabloid) !== 0){
|
||||
|
||||
// found table
|
||||
$tabloid =
|
||||
explode("·", $tabloid[0]["innerHTML"]);
|
||||
|
||||
foreach($tabloid as $tbl){
|
||||
|
||||
$preg =
|
||||
$this->fuckhtml
|
||||
->getTextContent(
|
||||
$tbl
|
||||
);
|
||||
|
||||
//$table[random_int(0,1000)] = $preg;
|
||||
|
||||
if(
|
||||
// match price
|
||||
preg_match(
|
||||
'/(\p{Sc}[^\p{Sc}]+)/',
|
||||
$preg,
|
||||
$match
|
||||
)
|
||||
){
|
||||
|
||||
$table["Price"] = trim($match[1]);
|
||||
}
|
||||
|
||||
if(
|
||||
// match in stock/delivery
|
||||
preg_match(
|
||||
'/(stock|delivery|returns)/i',
|
||||
$preg,
|
||||
$match
|
||||
)
|
||||
){
|
||||
|
||||
$table[ucfirst($match[1])] = trim($preg, " \t\n\r\0\x0B\xC2\xA0");
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// extract date from description
|
||||
$description_split =
|
||||
explode(
|
||||
"—", $description, 2
|
||||
);
|
||||
|
||||
if(count($description_split) === 1){
|
||||
|
||||
$description = $description_split[0];
|
||||
}elseif(strlen($description_split[0]) < 17){
|
||||
|
||||
$date = strtotime($description_split[0]);
|
||||
|
||||
if($date !== false){
|
||||
|
||||
$description = $description_split[1];
|
||||
}else{
|
||||
|
||||
$date = null;
|
||||
}
|
||||
}
|
||||
|
||||
$out["web"][] = [
|
||||
"title" => $this->titledots($title),
|
||||
"description" => $this->titledots($description),
|
||||
"url" => $link,
|
||||
"date" => $date,
|
||||
"type" => "web",
|
||||
"thumb" => $thumb,
|
||||
"sublink" => $sublinks,
|
||||
"table" => $table
|
||||
];
|
||||
}
|
||||
|
||||
// get next page
|
||||
if(count($out["web"]) > 5){
|
||||
|
||||
$params["start"] += 10;
|
||||
|
||||
$out["npt"] =
|
||||
$this->backend->store(
|
||||
json_encode($params),
|
||||
"web",
|
||||
$proxy
|
||||
);
|
||||
}
|
||||
|
||||
return $out;
|
||||
throw new Exception("Google made it impossible to scrape web results without a JavaScript runtime. In the meantime, use the Google API or the Google CSE scrapers.");
|
||||
}
|
||||
|
||||
|
||||
|
739
scraper/google_api.php
Normal file
739
scraper/google_api.php
Normal file
@@ -0,0 +1,739 @@
|
||||
<?php
|
||||
|
||||
// @TODO check for consent.google.com page, if need be
|
||||
|
||||
class google_api{
|
||||
|
||||
/**
 * Load the shared backend helper and create an instance of it for the
 * "google_api" scraper.
 */
public function __construct(){

	// include_once instead of include: the file is expected to declare the
	// backend class (it is instantiated right below), and declaring a class
	// twice in one request is a fatal error
	include_once "lib/backend.php";

	$this->backend = new backend("google_api");
}
|
||||
|
||||
/**
 * Describe the filters this scraper offers for a result page.
 *
 * Each filter is keyed by the name web() later reads from its input and
 * holds a "display" label plus an "option" list (value => label), or the
 * string "_DATE" for a date input.
 *
 * @param string $page result page name; only "web" is handled
 * @return array|null filter definitions for "web", null for any other page
 */
public function getfilters($page){

	$base = [
		"country" => [ // gl=<country> (image: cr=countryAF)
			"display" => "Country",
			"option" => [
				"any" => "Instance's country",
				"af" => "Afghanistan",
				"al" => "Albania",
				"dz" => "Algeria",
				"as" => "American Samoa",
				"ad" => "Andorra",
				"ao" => "Angola",
				"ai" => "Anguilla",
				"aq" => "Antarctica",
				"ag" => "Antigua and Barbuda",
				"ar" => "Argentina",
				"am" => "Armenia",
				"aw" => "Aruba",
				"au" => "Australia",
				"at" => "Austria",
				"az" => "Azerbaijan",
				"bs" => "Bahamas",
				"bh" => "Bahrain",
				"bd" => "Bangladesh",
				"bb" => "Barbados",
				"by" => "Belarus",
				"be" => "Belgium",
				"bz" => "Belize",
				"bj" => "Benin",
				"bm" => "Bermuda",
				"bt" => "Bhutan",
				"bo" => "Bolivia",
				"ba" => "Bosnia and Herzegovina",
				"bw" => "Botswana",
				"bv" => "Bouvet Island",
				"br" => "Brazil",
				"io" => "British Indian Ocean Territory",
				"bn" => "Brunei Darussalam",
				"bg" => "Bulgaria",
				"bf" => "Burkina Faso",
				"bi" => "Burundi",
				"kh" => "Cambodia",
				"cm" => "Cameroon",
				"ca" => "Canada",
				"cv" => "Cape Verde",
				"ky" => "Cayman Islands",
				"cf" => "Central African Republic",
				"td" => "Chad",
				"cl" => "Chile",
				"cn" => "China",
				"cx" => "Christmas Island",
				"cc" => "Cocos (Keeling) Islands",
				"co" => "Colombia",
				"km" => "Comoros",
				"cg" => "Congo",
				"cd" => "Congo, the Democratic Republic",
				"ck" => "Cook Islands",
				"cr" => "Costa Rica",
				"ci" => "Cote D'ivoire",
				"hr" => "Croatia",
				"cu" => "Cuba",
				"cy" => "Cyprus",
				"cz" => "Czech Republic",
				"dk" => "Denmark",
				"dj" => "Djibouti",
				"dm" => "Dominica",
				"do" => "Dominican Republic",
				"ec" => "Ecuador",
				"eg" => "Egypt",
				"sv" => "El Salvador",
				"gq" => "Equatorial Guinea",
				"er" => "Eritrea",
				"ee" => "Estonia",
				"et" => "Ethiopia",
				"fk" => "Falkland Islands (Malvinas)",
				"fo" => "Faroe Islands",
				"fj" => "Fiji",
				"fi" => "Finland",
				"fr" => "France",
				"gf" => "French Guiana",
				"pf" => "French Polynesia",
				"tf" => "French Southern Territories",
				"ga" => "Gabon",
				"gm" => "Gambia",
				"ge" => "Georgia",
				"de" => "Germany",
				"gh" => "Ghana",
				"gi" => "Gibraltar",
				"gr" => "Greece",
				"gl" => "Greenland",
				"gd" => "Grenada",
				"gp" => "Guadeloupe",
				"gu" => "Guam",
				"gt" => "Guatemala",
				"gn" => "Guinea",
				"gw" => "Guinea-Bissau",
				"gy" => "Guyana",
				"ht" => "Haiti",
				"hm" => "Heard Island and Mcdonald Islands",
				"va" => "Holy See (Vatican City State)",
				"hn" => "Honduras",
				"hk" => "Hong Kong",
				"hu" => "Hungary",
				"is" => "Iceland",
				"in" => "India",
				"id" => "Indonesia",
				"ir" => "Iran, Islamic Republic",
				"iq" => "Iraq",
				"ie" => "Ireland",
				"il" => "Israel",
				"it" => "Italy",
				"jm" => "Jamaica",
				"jp" => "Japan",
				"jo" => "Jordan",
				"kz" => "Kazakhstan",
				"ke" => "Kenya",
				"ki" => "Kiribati",
				"kp" => "Korea, Democratic People's Republic",
				"kr" => "Korea, Republic",
				"kw" => "Kuwait",
				"kg" => "Kyrgyzstan",
				"la" => "Lao People's Democratic Republic",
				"lv" => "Latvia",
				"lb" => "Lebanon",
				"ls" => "Lesotho",
				"lr" => "Liberia",
				"ly" => "Libyan Arab Jamahiriya",
				"li" => "Liechtenstein",
				"lt" => "Lithuania",
				"lu" => "Luxembourg",
				"mo" => "Macao",
				"mk" => "Macedonia, the Former Yugoslav Republic",
				"mg" => "Madagascar",
				"mw" => "Malawi",
				"my" => "Malaysia",
				"mv" => "Maldives",
				"ml" => "Mali",
				"mt" => "Malta",
				"mh" => "Marshall Islands",
				"mq" => "Martinique",
				"mr" => "Mauritania",
				"mu" => "Mauritius",
				"yt" => "Mayotte",
				"mx" => "Mexico",
				"fm" => "Micronesia, Federated States",
				"md" => "Moldova, Republic",
				"mc" => "Monaco",
				"mn" => "Mongolia",
				"ms" => "Montserrat",
				"ma" => "Morocco",
				"mz" => "Mozambique",
				"mm" => "Myanmar",
				"na" => "Namibia",
				"nr" => "Nauru",
				"np" => "Nepal",
				"nl" => "Netherlands",
				"an" => "Netherlands Antilles",
				"nc" => "New Caledonia",
				"nz" => "New Zealand",
				"ni" => "Nicaragua",
				"ne" => "Niger",
				"ng" => "Nigeria",
				"nu" => "Niue",
				"nf" => "Norfolk Island",
				"mp" => "Northern Mariana Islands",
				"no" => "Norway",
				"om" => "Oman",
				"pk" => "Pakistan",
				"pw" => "Palau",
				"ps" => "Palestinian Territory, Occupied",
				"pa" => "Panama",
				"pg" => "Papua New Guinea",
				"py" => "Paraguay",
				"pe" => "Peru",
				"ph" => "Philippines",
				"pn" => "Pitcairn",
				"pl" => "Poland",
				"pt" => "Portugal",
				"pr" => "Puerto Rico",
				"qa" => "Qatar",
				"re" => "Reunion",
				"ro" => "Romania",
				"ru" => "Russian Federation",
				"rw" => "Rwanda",
				"sh" => "Saint Helena",
				"kn" => "Saint Kitts and Nevis",
				"lc" => "Saint Lucia",
				"pm" => "Saint Pierre and Miquelon",
				"vc" => "Saint Vincent and the Grenadines",
				"ws" => "Samoa",
				"sm" => "San Marino",
				"st" => "Sao Tome and Principe",
				"sa" => "Saudi Arabia",
				"sn" => "Senegal",
				"cs" => "Serbia and Montenegro",
				"sc" => "Seychelles",
				"sl" => "Sierra Leone",
				"sg" => "Singapore",
				"sk" => "Slovakia",
				"si" => "Slovenia",
				"sb" => "Solomon Islands",
				"so" => "Somalia",
				"za" => "South Africa",
				"gs" => "South Georgia and the South Sandwich Islands",
				"es" => "Spain",
				"lk" => "Sri Lanka",
				"sd" => "Sudan",
				"sr" => "Suriname",
				"sj" => "Svalbard and Jan Mayen",
				"sz" => "Swaziland",
				"se" => "Sweden",
				"ch" => "Switzerland",
				"sy" => "Syrian Arab Republic",
				"tw" => "Taiwan, Province of China",
				"tj" => "Tajikistan",
				"tz" => "Tanzania, United Republic",
				"th" => "Thailand",
				"tl" => "Timor-Leste",
				"tg" => "Togo",
				"tk" => "Tokelau",
				"to" => "Tonga",
				"tt" => "Trinidad and Tobago",
				"tn" => "Tunisia",
				"tr" => "Turkey",
				"tm" => "Turkmenistan",
				"tc" => "Turks and Caicos Islands",
				"tv" => "Tuvalu",
				"ug" => "Uganda",
				"ua" => "Ukraine",
				"ae" => "United Arab Emirates",
				"uk" => "United Kingdom",
				"us" => "United States",
				"um" => "United States Minor Outlying Islands",
				"uy" => "Uruguay",
				"uz" => "Uzbekistan",
				"vu" => "Vanuatu",
				"ve" => "Venezuela",
				"vn" => "Viet Nam",
				"vg" => "Virgin Islands, British",
				"vi" => "Virgin Islands, U.S.",
				"wf" => "Wallis and Futuna",
				"eh" => "Western Sahara",
				"ye" => "Yemen",
				"zm" => "Zambia",
				"zw" => "Zimbabwe"
			]
		],
		"nsfw" => [
			"display" => "NSFW",
			"option" => [
				"yes" => "Yes", // safe=off
				"no" => "No" // safe=active
			]
		]
	];

	switch($page){

		case "web":
			return array_merge(
				$base,
				[
					"lang" => [ // lr=<lang> (prefix lang with "lang_")
						"display" => "Language",
						"option" => [
							"any" => "Any language",
							"ar" => "Arabic",
							"bg" => "Bulgarian",
							"ca" => "Catalan",
							"cs" => "Czech",
							"da" => "Danish",
							"de" => "German",
							"el" => "Greek",
							"en" => "English",
							"es" => "Spanish",
							"et" => "Estonian",
							"fi" => "Finnish",
							"fr" => "French",
							"hr" => "Croatian",
							"hu" => "Hungarian",
							"id" => "Indonesian",
							"is" => "Icelandic",
							"it" => "Italian",
							"iw" => "Hebrew",
							"ja" => "Japanese",
							"ko" => "Korean",
							"lt" => "Lithuanian",
							"lv" => "Latvian",
							"nl" => "Dutch",
							"no" => "Norwegian",
							"pl" => "Polish",
							"pt" => "Portuguese",
							"ro" => "Romanian",
							"ru" => "Russian",
							"sk" => "Slovak",
							"sl" => "Slovenian",
							"sr" => "Serbian",
							"sv" => "Swedish",
							"tr" => "Turkish",
							"zh-CN" => "Chinese (Simplified)",
							"zh-TW" => "Chinese (Traditional)"
						]
					],
					"sort" => [ // sort=<value>, forwarded verbatim
						"display" => "Sort by",
						"option" => [
							"any" => "Any order",
							// ":d" is descending (most recent date first),
							// ":a" is ascending (earliest date first)
							"date:d" => "Newest",
							"date:a" => "Oldest"
						]
					],
					"newer" => [
						"display" => "Newer than",
						"option" => "_DATE"
					],
					"rm_dupes" => [ // "no" sends filter=0
						"display" => "Remove duplicates",
						"option" => [
							"yes" => "Yes",
							"no" => "No"
						]
					]
				]
			);
	}

	// only "web" is implemented; any other page falls through and yields null
}
|
||||
|
||||
/**
 * Perform an HTTP GET through the given proxy and return the raw body.
 *
 * @param mixed  $proxy handed unchanged to backend::assign_proxy()
 * @param string $url   endpoint without query string
 * @param array  $get   query parameters; appended via http_build_query()
 * @return string|bool  whatever curl_exec() returned for the transfer
 * @throws Exception    carrying the cURL error message if the transfer fails
 */
private function get($proxy, $url, $get = []){

	if($get !== []){

		$url .= "?" . http_build_query($get);
	}

	$curlproc = curl_init();

	// try/finally so the handle is released on the error path as well;
	// previously it was only closed after a successful transfer
	try{

		curl_setopt($curlproc, CURLOPT_URL, $url);

		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt(
			$curlproc,
			CURLOPT_HTTPHEADER,
			[
				"Accept: application/json",
				"Accept-Encoding: gzip"
			]
		);

		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

		// follow redirects
		curl_setopt($curlproc, CURLOPT_FOLLOWLOCATION, true);

		$this->backend->assign_proxy($curlproc, $proxy);

		$data = curl_exec($curlproc);

		if(curl_errno($curlproc)){

			throw new Exception(curl_error($curlproc));
		}

		return $data;

	}finally{

		curl_close($curlproc);
	}
}
|
||||
|
||||
/**
 * Run a web search against the Custom Search JSON API.
 *
 * Reads from $get: "npt" (next page token, falsy for a first page), "s"
 * (query), "newer" (unix timestamp or false), "rm_dupes", "country",
 * "lang", "nsfw" and "sort".
 *
 * @param array $get request parameters as listed above
 * @return array result set with the keys status, spelling, npt, answer,
 *               web, image, video, news and related; only "web" and "npt"
 *               are populated here
 * @throws Exception when the request fails, the response is not valid
 *                   JSON, the API reports an error, or no "items" array
 *                   is present
 */
public function web($get){

	// rotate proxy + key on EVERY request; the index of the chosen key is
	// handed to get_ip() as the proxy index
	$keydata = $this->backend->get_key();
	$proxy = $this->backend->get_ip($keydata["increment"]);

	if($get["npt"]){

		// the proxy stored with the token is intentionally discarded, a
		// fresh one was picked above
		[$params] =
			$this->backend->get(
				$get["npt"],
				"web"
			);

		$params = json_decode($params, true);

		if(!is_array($params)){

			throw new Exception("Failed to decode next page token");
		}

		// tokens are stored without a key (see below), attach the current one
		$params["key"] = $keydata["key"];

	}else{

		$params = [
			"q" => $get["s"],
			"cx" => config::GOOGLE_CX_ENDPOINT,
			"num" => 10,
			"start" => 1,
			"key" => $keydata["key"]
		];

		//
		// parse filters
		//
		if($get["newer"] !== false){

			// dateRestrict=d<N> means "the past N days". A day is 86400
			// seconds (the previous divisor of 100000 under-counted);
			// round up so the requested date stays inside the window and
			// never send less than one day
			$days = (int)ceil((time() - $get["newer"]) / 86400);

			$params["dateRestrict"] = "d" . max(1, $days);
		}

		if($get["rm_dupes"] == "no"){ $params["filter"] = "0"; }
		if($get["country"] != "any"){ $params["gl"] = $get["country"]; }
		if($get["lang"] != "any"){ $params["lr"] = "lang_" . $get["lang"]; }
		if($get["sort"] != "any"){ $params["sort"] = $get["sort"]; }

		$params["safe"] = $get["nsfw"] == "yes" ? "off" : "active";
	}

	try{

		$json =
			$this->get(
				$proxy,
				"https://www.googleapis.com/customsearch/v1",
				$params
			);

	}catch(Exception $error){

		// keep the original failure reachable through getPrevious()
		throw new Exception("Failed to fetch JSON", 0, $error);
	}

	$json = json_decode($json, true);

	if($json === null){

		throw new Exception("Failed to decode JSON");
	}

	if(isset($json["error"]["message"])){

		throw new Exception(
			"API returned an error: " .
			$json["error"]["message"] .
			" (key #" . $keydata["increment"] . ")"
		);
	}

	if(!isset($json["items"])){

		throw new Exception("Failed to access items array");
	}

	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];

	foreach($json["items"] as $result){

		//
		// probe for thumbnail: first candidate that exists wins
		//
		$thumb = [
			"url" => null,
			"ratio" => null
		];

		$candidates = [
			$result["pagemap"]["cse_thumbnail"][0]["src"] ?? null,
			$result["pagemap"]["cse_image"][0]["src"] ?? null,
			$result["pagemap"]["metatags"][0]["twitter:image"] ?? null,
			$result["pagemap"]["metatags"][0]["og:image"] ?? null
		];

		foreach($candidates as $candidate){

			if($candidate !== null){

				$thumb = [
					"url" => $candidate,
					"ratio" => "16:9"
				];
				break;
			}
		}

		//
		// probe for page format: "application/pdf" becomes "PDF"
		//
		$mime = "web";

		if(isset($result["mime"])){

			$mime_parts = explode("/", $result["mime"], 2);

			if(count($mime_parts) === 2){

				$mime = strtoupper($mime_parts[1]);
			}
		}

		//
		// Get date
		//
		// $date MUST be reset for every result: it used to be assigned in
		// only one branch, leaving it undefined for the first result and
		// carrying the previous result's date over to later ones
		$date = null;
		$description = $result["snippet"] ?? "";

		$description_split = explode("...", $description, 2);

		if(
			count($description_split) === 2 &&
			strlen($description_split[0]) < 17
		){

			// a short prefix before the first "..." may be a date
			$date = trim($description_split[0]);

			if(strtotime($date) !== false){

				$description = $description_split[1];

			}else{

				//
				// fallback to getting date from meta tags
				//
				$date =
					$result["pagemap"]["metatags"][0]["creationdate"] ??
					$result["pagemap"]["metatags"][0]["moddate"] ??
					null;
			}
		}

		if($date !== null){

			// drop the "D:" prefix and the apostrophes before parsing
			$date =
				strtotime(
					trim(
						str_replace(
							["D:", "'"],
							"",
							$date
						)
					)
				);

			if($date === false){

				$date = null;
			}
		}

		$out["web"][] = [
			"title" => $this->titledots($result["title"]),
			"description" => $this->titledots($description),
			"url" => $result["link"],
			"date" => $date,
			"type" => $mime,
			"thumb" => $thumb,
			"sublink" => [],
			"table" => []
		];
	}

	// get npt
	if(isset($json["queries"]["nextPage"][0]["startIndex"])){

		// never persist the API key inside the stored token
		unset($params["key"]);

		// use the index the API reports for the next page; it was
		// previously written to an unused variable while "start" was
		// incremented by a fixed 10
		$params["start"] = (int)$json["queries"]["nextPage"][0]["startIndex"];

		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"web",
				$proxy
			);
	}

	return $out;
}
|
||||
|
||||
/**
 * Strip leading and trailing whitespace, NUL bytes, dots and ellipsis
 * characters ("…") from a title or description.
 *
 * @param string|null $title text to clean; null is treated as ""
 * @return string cleaned text
 */
private function titledots($title){

	$title = (string)$title;

	// trim() works on single bytes, so listing the 3-byte "…" in its mask
	// also strips the lone bytes E2, 80 and A6. That corrupts text which
	// starts with another U+20xx character such as a curly quote, a dash
	// or a bullet. A /u regex removes whole characters only.
	$cleaned =
		preg_replace(
			'/\A[ .\t\n\r\0\x0B\x{2026}]+|[ .\t\n\r\0\x0B\x{2026}]+\z/u',
			"",
			$title
		);

	if($cleaned === null){

		// preg_replace() failed (e.g. invalid UTF-8 input): fall back to a
		// byte-wise trim restricted to the single-byte characters
		return trim($title, " .\t\n\r\0\x0B");
	}

	return $cleaned;
}
|
||||
}
|
@@ -137,6 +137,10 @@ $settings = [
|
||||
"value" => "google",
|
||||
"text" => "Google"
|
||||
],
|
||||
[
|
||||
"value" => "google_api",
|
||||
"text" => "Google API"
|
||||
],
|
||||
[
|
||||
"value" => "google_cse",
|
||||
"text" => "Google CSE"
|
||||
|
Reference in New Issue
Block a user