greppr fix
This commit is contained in:
parent
ad535a1609
commit
319640cd77
|
@ -16,49 +16,82 @@ class greppr{
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
private function get($proxy, $url, $get = [], $cookie = false){
|
private function get($proxy, $url, $get = [], $cookie = false, $post){
|
||||||
|
|
||||||
$curlproc = curl_init();
|
$curlproc = curl_init();
|
||||||
|
|
||||||
if($get !== []){
|
|
||||||
$get = http_build_query($get);
|
|
||||||
$url .= "?" . $get;
|
|
||||||
}
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_URL, $url);
|
curl_setopt($curlproc, CURLOPT_URL, $url);
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
||||||
|
|
||||||
if($cookie === false){
|
if($post === false){
|
||||||
|
|
||||||
|
if($get !== []){
|
||||||
|
$get = http_build_query($get);
|
||||||
|
$url .= "?" . $get;
|
||||||
|
}
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
if($cookie === false){
|
||||||
["User-Agent: " . config::USER_AGENT,
|
|
||||||
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
||||||
"Accept-Language: en-US,en;q=0.5",
|
["User-Agent: " . config::USER_AGENT,
|
||||||
"Accept-Encoding: gzip",
|
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||||
"DNT: 1",
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
"Connection: keep-alive",
|
"Accept-Encoding: gzip",
|
||||||
"Upgrade-Insecure-Requests: 1",
|
"DNT: 1",
|
||||||
"Sec-Fetch-Dest: document",
|
"Connection: keep-alive",
|
||||||
"Sec-Fetch-Mode: navigate",
|
"Upgrade-Insecure-Requests: 1",
|
||||||
"Sec-Fetch-Site: none",
|
"Sec-Fetch-Dest: document",
|
||||||
"Sec-Fetch-User: ?1"]
|
"Sec-Fetch-Mode: navigate",
|
||||||
);
|
"Sec-Fetch-Site: none",
|
||||||
|
"Sec-Fetch-User: ?1"]
|
||||||
|
);
|
||||||
|
}else{
|
||||||
|
|
||||||
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
||||||
|
["User-Agent: " . config::USER_AGENT,
|
||||||
|
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
|
"Accept-Encoding: gzip, deflate, br, zstd",
|
||||||
|
"DNT: 1",
|
||||||
|
"Sec-GPC: 1",
|
||||||
|
"Connection: keep-alive",
|
||||||
|
"Referer: https://greppr.org/search",
|
||||||
|
"Cookie: PHPSESSID=$cookie",
|
||||||
|
"Upgrade-Insecure-Requests: 1",
|
||||||
|
"Sec-Fetch-Dest: document",
|
||||||
|
"Sec-Fetch-Mode: navigate",
|
||||||
|
"Sec-Fetch-Site: same-origin",
|
||||||
|
"Sec-Fetch-User: ?1",
|
||||||
|
"Priority: u=0, i"]
|
||||||
|
);
|
||||||
|
}
|
||||||
}else{
|
}else{
|
||||||
|
|
||||||
|
$get = http_build_query($get);
|
||||||
|
|
||||||
|
curl_setopt($curlproc, CURLOPT_POST, true);
|
||||||
|
curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
||||||
["User-Agent: " . config::USER_AGENT,
|
["User-Agent: " . config::USER_AGENT,
|
||||||
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
"Accept-Language: en-US,en;q=0.5",
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
"Accept-Encoding: gzip",
|
"Accept-Encoding: gzip, deflate, br, zstd",
|
||||||
"Cookie: PHPSESSID=" . $cookie,
|
"Content-Type: application/x-www-form-urlencoded",
|
||||||
|
"Content-Length: " . strlen($get),
|
||||||
|
"Origin: https://greppr.org",
|
||||||
"DNT: 1",
|
"DNT: 1",
|
||||||
|
"Sec-GPC: 1",
|
||||||
"Connection: keep-alive",
|
"Connection: keep-alive",
|
||||||
|
"Referer: https://greppr.org/",
|
||||||
|
"Cookie: PHPSESSID=$cookie",
|
||||||
"Upgrade-Insecure-Requests: 1",
|
"Upgrade-Insecure-Requests: 1",
|
||||||
"Sec-Fetch-Dest: document",
|
"Sec-Fetch-Dest: document",
|
||||||
"Sec-Fetch-Mode: navigate",
|
"Sec-Fetch-Mode: navigate",
|
||||||
"Sec-Fetch-Site: none",
|
"Sec-Fetch-Site: same-origin",
|
||||||
"Sec-Fetch-User: ?1"]
|
"Sec-Fetch-User: ?1",
|
||||||
|
"Priority: u=0, i"]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -113,7 +146,24 @@ class greppr{
|
||||||
|
|
||||||
[$q, $proxy] = $this->backend->get($get["npt"], "web");
|
[$q, $proxy] = $this->backend->get($get["npt"], "web");
|
||||||
|
|
||||||
$q = json_decode($q, true);
|
$tokens = json_decode($q, true);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Get paginated page
|
||||||
|
//
|
||||||
|
try{
|
||||||
|
|
||||||
|
$html = $this->get(
|
||||||
|
$proxy,
|
||||||
|
"https://greppr.org" . $tokens["get"],
|
||||||
|
[],
|
||||||
|
$tokens["cookie"],
|
||||||
|
false
|
||||||
|
);
|
||||||
|
}catch(Exception $error){
|
||||||
|
|
||||||
|
throw new Exception("Failed to fetch search page");
|
||||||
|
}
|
||||||
|
|
||||||
}else{
|
}else{
|
||||||
|
|
||||||
|
@ -124,88 +174,114 @@ class greppr{
|
||||||
}
|
}
|
||||||
|
|
||||||
$proxy = $this->backend->get_ip();
|
$proxy = $this->backend->get_ip();
|
||||||
}
|
|
||||||
|
|
||||||
// get token
|
|
||||||
// token[0] = static token that changes once a day
|
|
||||||
// token[1] = dynamic token that changes on every request
|
|
||||||
// token[1] = PHPSESSID cookie
|
|
||||||
$tokens = apcu_fetch("greppr_token");
|
|
||||||
|
|
||||||
if(
|
|
||||||
$tokens === false ||
|
|
||||||
$first_attempt === false // force token fetch
|
|
||||||
){
|
|
||||||
|
|
||||||
// we haven't gotten the token yet, get it
|
//
|
||||||
|
// get token
|
||||||
|
//
|
||||||
try{
|
try{
|
||||||
|
|
||||||
$response =
|
$html =
|
||||||
$this->get(
|
$this->get(
|
||||||
$proxy,
|
$proxy,
|
||||||
"https://greppr.org",
|
"https://greppr.org",
|
||||||
[]
|
[],
|
||||||
|
false,
|
||||||
|
false
|
||||||
);
|
);
|
||||||
}catch(Exception $error){
|
}catch(Exception $error){
|
||||||
|
|
||||||
throw new Exception("Failed to fetch search tokens");
|
throw new Exception("Failed to fetch search tokens");
|
||||||
}
|
}
|
||||||
|
|
||||||
$tokens = $this->parse_token($response);
|
//
|
||||||
|
// Parse token
|
||||||
|
//
|
||||||
|
$this->fuckhtml->load($html["data"]);
|
||||||
|
|
||||||
|
$tokens = [];
|
||||||
|
|
||||||
|
$inputs =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByTagName(
|
||||||
|
"input"
|
||||||
|
);
|
||||||
|
|
||||||
|
foreach($inputs as $input){
|
||||||
|
|
||||||
|
if(!isset($input["attributes"]["name"])){
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch($input["attributes"]["name"]){
|
||||||
|
|
||||||
|
case "var1":
|
||||||
|
case "var2":
|
||||||
|
case "n":
|
||||||
|
$tokens[$input["attributes"]["name"]] =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$input["attributes"]["value"]
|
||||||
|
);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
$tokens["req"] =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$input["attributes"]["name"]
|
||||||
|
);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// get cookie
|
||||||
|
preg_match(
|
||||||
|
'/PHPSESSID=([^;]+)/',
|
||||||
|
$html["headers"]["set-cookie"],
|
||||||
|
$cookie
|
||||||
|
);
|
||||||
|
|
||||||
|
if(!isset($cookie[1])){
|
||||||
|
|
||||||
|
// server sent an unexpected cookie
|
||||||
|
throw new Exception("Got malformed cookie");
|
||||||
|
}
|
||||||
|
|
||||||
|
$tokens["cookie"] = $cookie[1];
|
||||||
|
|
||||||
if($tokens === false){
|
if($tokens === false){
|
||||||
|
|
||||||
throw new Exception("Failed to grep search tokens");
|
throw new Exception("Failed to grep search tokens");
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
try{
|
|
||||||
|
|
||||||
if($get["npt"]){
|
//
|
||||||
|
// Get initial search page
|
||||||
|
//
|
||||||
|
try{
|
||||||
|
|
||||||
|
$html = $this->get(
|
||||||
|
$proxy,
|
||||||
|
"https://greppr.org/search",
|
||||||
|
[
|
||||||
|
"var1" => $tokens["var1"],
|
||||||
|
"var2" => $tokens["var2"],
|
||||||
|
$tokens["req"] => $search,
|
||||||
|
"n" => $tokens["n"]
|
||||||
|
],
|
||||||
|
$tokens["cookie"],
|
||||||
|
true
|
||||||
|
);
|
||||||
|
}catch(Exception $error){
|
||||||
|
|
||||||
$params = [
|
throw new Exception("Failed to fetch search page");
|
||||||
$tokens[0] => $q["q"],
|
|
||||||
"s" => $q["s"],
|
|
||||||
"l" => 30,
|
|
||||||
"n" => $tokens[1]
|
|
||||||
];
|
|
||||||
}else{
|
|
||||||
|
|
||||||
$params = [
|
|
||||||
$tokens[0] => $search,
|
|
||||||
"n" => $tokens[1]
|
|
||||||
];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$searchresults = $this->get(
|
|
||||||
$proxy,
|
|
||||||
"https://greppr.org/search",
|
|
||||||
$params,
|
|
||||||
$tokens[2]
|
|
||||||
);
|
|
||||||
}catch(Exception $error){
|
|
||||||
|
|
||||||
throw new Exception("Failed to fetch search page");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if(strlen($searchresults["data"]) === 0){
|
//$html = file_get_contents("scraper/greppr.html");
|
||||||
|
//$this->fuckhtml->load($html);
|
||||||
// redirected to main page, which means we got old token
|
$this->fuckhtml->load($html["data"]);
|
||||||
// generate a new one
|
|
||||||
|
|
||||||
// ... unless we just tried to do that
|
|
||||||
if($first_attempt === false){
|
|
||||||
|
|
||||||
throw new Exception("Failed to get a new search token");
|
|
||||||
}
|
|
||||||
|
|
||||||
return $this->web($get, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
// refresh the token with new data (this also triggers fuckhtml load)
|
|
||||||
$this->parse_token($searchresults, $tokens[2]);
|
|
||||||
|
|
||||||
// response object
|
|
||||||
$out = [
|
$out = [
|
||||||
"status" => "ok",
|
"status" => "ok",
|
||||||
"spelling" => [
|
"spelling" => [
|
||||||
|
@ -254,24 +330,16 @@ class greppr{
|
||||||
|
|
||||||
if($break === true){
|
if($break === true){
|
||||||
|
|
||||||
parse_str(
|
|
||||||
$this->fuckhtml
|
|
||||||
->getTextContent(
|
|
||||||
$a["attributes"]["href"]
|
|
||||||
),
|
|
||||||
$values
|
|
||||||
);
|
|
||||||
|
|
||||||
$values = array_values($values);
|
|
||||||
|
|
||||||
$out["npt"] =
|
$out["npt"] =
|
||||||
$this->backend->store(
|
$this->backend->store(
|
||||||
json_encode(
|
json_encode([
|
||||||
[
|
"get" =>
|
||||||
"q" => $values[0],
|
$this->fuckhtml
|
||||||
"s" => $values[1]
|
->getTextContent(
|
||||||
]
|
$a["attributes"]["href"]
|
||||||
),
|
),
|
||||||
|
"cookie" => $tokens["cookie"]
|
||||||
|
]),
|
||||||
"web",
|
"web",
|
||||||
$proxy
|
$proxy
|
||||||
);
|
);
|
||||||
|
@ -360,74 +428,6 @@ class greppr{
|
||||||
return $out;
|
return $out;
|
||||||
}
|
}
|
||||||
|
|
||||||
private function parse_token($response, $cookie = false){
|
|
||||||
|
|
||||||
$this->fuckhtml->load($response["data"]);
|
|
||||||
|
|
||||||
$scripts =
|
|
||||||
$this->fuckhtml
|
|
||||||
->getElementsByTagName("script");
|
|
||||||
|
|
||||||
$found = false;
|
|
||||||
foreach($scripts as $script){
|
|
||||||
|
|
||||||
preg_match(
|
|
||||||
'/window\.location ?= ?\'\/search\?([^=]+).*&n=([0-9]+)/',
|
|
||||||
$script["innerHTML"],
|
|
||||||
$tokens
|
|
||||||
);
|
|
||||||
|
|
||||||
if(isset($tokens[1])){
|
|
||||||
|
|
||||||
$found = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if($found === false){
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
$tokens = [
|
|
||||||
$tokens[1],
|
|
||||||
$tokens[2]
|
|
||||||
];
|
|
||||||
|
|
||||||
if($cookie !== false){
|
|
||||||
|
|
||||||
// we already specified a cookie, so use the one we have already
|
|
||||||
$tokens[] = $cookie;
|
|
||||||
apcu_store("greppr_token", $tokens);
|
|
||||||
|
|
||||||
return $tokens;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(!isset($response["headers"]["set-cookie"])){
|
|
||||||
|
|
||||||
// server didn't send a cookie
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// get cookie
|
|
||||||
preg_match(
|
|
||||||
'/PHPSESSID=([^;]+)/',
|
|
||||||
$response["headers"]["set-cookie"],
|
|
||||||
$cookie
|
|
||||||
);
|
|
||||||
|
|
||||||
if(!isset($cookie[1])){
|
|
||||||
|
|
||||||
// server sent an unexpected cookie
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
$tokens[] = $cookie[1];
|
|
||||||
apcu_store("greppr_token", $tokens);
|
|
||||||
|
|
||||||
return $tokens;
|
|
||||||
}
|
|
||||||
|
|
||||||
private function limitstrlen($text){
|
private function limitstrlen($text){
|
||||||
|
|
||||||
return explode("\n", wordwrap($text, 300, "\n"))[0];
|
return explode("\n", wordwrap($text, 300, "\n"))[0];
|
||||||
|
|
Loading…
Reference in New Issue