re-added solofield, added mullvad for brave and google
This commit is contained in:
@@ -43,7 +43,7 @@ class config{
|
||||
|
||||
// If this regex expression matches on the user agent, it blocks the request
|
||||
// Not useful at all against a targeted attack
|
||||
const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider|qwant/i';
|
||||
const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider|qwant|meta/i';
|
||||
|
||||
// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
|
||||
// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
|
||||
@@ -118,7 +118,7 @@ class config{
|
||||
|
||||
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
||||
// Changing this might break things.
|
||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:141.0) Gecko/20100101 Firefox/141.0";
|
||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0";
|
||||
|
||||
// Proxy pool assignments for each scraper
|
||||
// false = Use server's raw IP
|
||||
@@ -130,6 +130,8 @@ class config{
|
||||
const PROXY_GOOGLE = false;
|
||||
const PROXY_GOOGLE_API = false;
|
||||
const PROXY_GOOGLE_CSE = false;
|
||||
const PROXY_MULLVAD_GOOGLE = false;
|
||||
const PROXY_MULLVAD_BRAVE = false;
|
||||
const PROXY_STARTPAGE = false;
|
||||
const PROXY_QWANT = false;
|
||||
const PROXY_BAIDU = false;
|
||||
@@ -143,6 +145,7 @@ class config{
|
||||
const PROXY_WIBY = false;
|
||||
const PROXY_CURLIE = false;
|
||||
const PROXY_YT = false; // youtube
|
||||
const PROXY_ARCHIVEORG = false;
|
||||
const PROXY_SEPIASEARCH = false;
|
||||
const PROXY_ODYSEE = false;
|
||||
const PROXY_VIMEO = false;
|
||||
|
342
scraper/mullvad.php
Normal file
342
scraper/mullvad.php
Normal file
@@ -0,0 +1,342 @@
|
||||
<?php
|
||||
|
||||
/**
 * Scraper for the Mullvad Leta search front-end (leta.mullvad.net).
 *
 * One instance is bound to a single upstream engine name, which is sent to
 * Leta with every request. Results are read from the site's
 * "/search/__data.json" endpoint, whose payload is a list of "nodes"; each
 * node carries a flat "data" array in which entries reference one another
 * by integer index.
 */
class mullvad{
	
	/** @var string Engine name forwarded to Leta (e.g. "brave" or "google"). */
	public $engine;
	
	/** @var backend Project helper used for proxy assignment and next-page token storage. */
	public $backend;
	
	/**
	 * @param string $engine Engine name sent to Leta in the query string and cookie.
	 */
	public function __construct($engine){
		
		$this->engine = $engine;
		
		// include_once so that loading this scraper more than once in the same
		// request cannot re-declare the backend class.
		include_once "lib/backend.php";
		$this->backend = new backend("mullvad_{$this->engine}");
	}
	
	/**
	 * Describe the filters this scraper understands.
	 *
	 * @param string $page Requested page type; unused, the same filters are always returned.
	 * @return array Filter name => ["display" => label, "option" => [value => label]].
	 */
	public function getfilters($page){
		
		return [
			"country" => [ // &country=
				"display" => "Country",
				"option" => [
					"any" => "Any country",
					"ar" => "Argentina",
					"au" => "Australia",
					"at" => "Austria",
					"be" => "Belgium",
					"br" => "Brazil",
					"ca" => "Canada",
					"cl" => "Chile",
					"cn" => "China",
					"dk" => "Denmark",
					"fi" => "Finland",
					"fr" => "France",
					"de" => "Germany",
					"hk" => "Hong Kong",
					"in" => "India",
					"id" => "Indonesia",
					"it" => "Italy",
					"jp" => "Japan",
					"kr" => "Korea, Republic",
					"my" => "Malaysia",
					"mx" => "Mexico",
					"nl" => "Netherlands",
					"nz" => "New Zealand",
					"no" => "Norway",
					"ph" => "Philippines",
					"pl" => "Poland",
					"pt" => "Portugal",
					"ru" => "Russian Federation",
					"sa" => "Saudi Arabia",
					"za" => "South Africa",
					"es" => "Spain",
					"se" => "Sweden",
					"ch" => "Switzerland",
					"tw" => "Taiwan",
					"tr" => "Turkey",
					"uk" => "United Kingdom",
					"us" => "United States"
				]
			],
			"language" => [ // &language=
				"display" => "Language",
				"option" => [
					"any" => "Any language",
					"ar" => "Arabic",
					"bg" => "Bulgarian",
					"ca" => "Catalan",
					"zh-hans" => "Chinese (Simplified)",
					"zh-hant" => "Chinese (Traditional)",
					"hr" => "Croatian",
					"cs" => "Czech",
					"da" => "Danish",
					"nl" => "Dutch",
					"en" => "English",
					"et" => "Estonian",
					"fi" => "Finnish",
					"fr" => "French",
					"de" => "German",
					"he" => "Hebrew",
					"hu" => "Hungarian",
					"is" => "Icelandic",
					"it" => "Italian",
					"jp" => "Japanese", // NOTE(review): ISO 639-1 for Japanese is "ja"; confirm which code Leta expects
					"ko" => "Korean",
					"lv" => "Latvian",
					"lt" => "Lithuanian",
					"nb" => "Norwegian",
					"pl" => "Polish",
					"pt" => "Portuguese",
					"ro" => "Romanian",
					"ru" => "Russian",
					"sr" => "Serbian",
					"sk" => "Slovak",
					"sl" => "Slovenian",
					"es" => "Spanish",
					"sv" => "Swedish",
					"tr" => "Turkish"
				]
			],
			"time" => [ // &lastUpdated=
				"display" => "Time posted",
				"option" => [
					"any" => "Any time",
					"d" => "Past day",
					"w" => "Past week",
					"m" => "Past month",
					"y" => "Past year"
				]
			]
		];
	}
	
	/**
	 * Perform a GET request against Leta and return the raw response body.
	 *
	 * @param mixed  $proxy Proxy descriptor handed to backend::assign_proxy().
	 * @param string $url   Endpoint URL without query string.
	 * @param array  $get   Query parameters; appended to $url when non-empty.
	 * @return string|bool Whatever curl_exec() returned.
	 * @throws Exception With the curl error message when the transfer fails.
	 */
	private function get($proxy, $url, $get = []){
		
		$curlproc = curl_init();
		
		if($get !== []){
			
			$url .= "?" . http_build_query($get);
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		// http2 bypass
		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: */*",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip, deflate, br, zstd",
			"Referer: https://leta.mullvad.net/search",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			// keep the cookie in agreement with the "engine" query parameter
			// instead of always announcing brave
			"Cookie: engine=" . $this->engine,
			"Sec-Fetch-Dest: empty",
			"Sec-Fetch-Mode: cors",
			"Sec-Fetch-Site: same-origin",
			"Priority: u=0",
			"TE: trailers"]
		);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// read the message before releasing the handle, then release it
			// on the failure path as well
			$message = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($message);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Run a web search, or continue one from a next-page token.
	 *
	 * @param array $get Request parameters. Reads "npt" (next-page token), and
	 *                   for a fresh search "s", "country", "language", "time".
	 * @return array Result structure with keys status, spelling, npt, answer,
	 *               web, image, video, news and related. Only "web" and "npt"
	 *               are populated by this scraper.
	 * @throws Exception On an empty search term, a failed request, or a
	 *                   response that does not have the expected layout.
	 */
	public function web($get){
		
		if($get["npt"]){
			
			// continuation: the stored token holds the JSON-encoded query
			// parameters together with the proxy used for the first page
			[$params, $proxy] = $this->backend->get($get["npt"], "web");
			$params = json_decode($params, true);
			
			if(!is_array($params)){
				
				throw new Exception("Failed to decode next page token");
			}
			
		}else{
			
			if(strlen($get["s"]) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			// generate filters
			$params = [
				"q" => $get["s"],
				"engine" => $this->engine,
				"page" => 1
			];
			
			if($get["country"] != "any"){
				
				$params["country"] = $get["country"];
			}
			
			if($get["language"] != "any"){
				
				$params["language"] = $get["language"];
			}
			
			if($get["time"] != "any"){
				
				$params["lastUpdated"] = $get["time"];
			}
			
			$proxy = $this->backend->get_ip();
		}
		
		try{
			$json = $this->get(
				$proxy,
				"https://leta.mullvad.net/search/__data.json",
				$params
			);
		}catch(Exception $error){
			
			// same message as before; the curl error is kept as the previous exception
			throw new Exception("Failed to fetch search page", 0, $error);
		}
		
		$json = json_decode($json, true);
		
		if($json === null){
			
			throw new Exception("Failed to decode JSON");
		}
		
		if(!isset($json["nodes"])){
			
			throw new Exception("Mullvad did not return a nodes object");
		}
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null, // filled in below when the payload points at a next page
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		// parse json payload
		foreach($json["nodes"] as $node){
			
			if(!isset($node["data"][0]["q"])){
				
				// not iterating through the query object
				continue;
			}
			
			$data = $node["data"];
			
			// entry 0 contains pointers (indexes into $data) to what we need
			$node0 = $data[0];
			
			if(!isset($node0["success"], $data[$node0["success"]])){
				
				throw new Exception("Mullvad did not return a success object");
			}
			
			if($data[$node0["success"]] === false){
				
				throw new Exception("Mullvad flagged the response as unsuccessful");
			}
			
			if(
				!isset($node0["items"], $data[$node0["items"]]) ||
				!is_array($data[$node0["items"]])
			){
				
				throw new Exception("Mullvad did not return an items object");
			}
			
			//
			// Iterate over results
			//
			foreach($data[$node0["items"]] as $pointer){
				
				$result = $data[$pointer] ?? null;
				
				if(!is_array($result)){
					
					// dangling pointer, nothing to emit
					continue;
				}
				
				$link = $this->deref($data, $result, "link");
				
				if($link === null){
					
					// a result without an URL is of no use to the caller
					continue;
				}
				
				$title = (string)$this->deref($data, $result, "title");
				$description = (string)$this->deref($data, $result, "snippet");
				
				$date = null;
				if($this->engine == "google"){
					
					// attempt to extract a leading date from the snippet,
					// eg: "Jan 12, 2017 ... rest of the snippet"
					$date_parts = explode(" ... ", $description, 2);
					
					if(
						count($date_parts) === 2 &&
						strlen($date_parts[0]) < 15
					){
						
						$date = strtotime(trim($date_parts[0]));
						
						if($date === false){
							
							$date = null;
						}else{
							
							$description = trim($date_parts[1]);
						}
					}
				}
				
				$out["web"][] = [
					"title" => $this->titledots($title),
					"description" => $this->titledots($description),
					"url" => $link,
					"date" => $date,
					"type" => "web",
					"thumb" => [
						"url" => null,
						"ratio" => null
					],
					"sublink" => [],
					"table" => []
				];
			}
			
			//
			// Get nextpage
			//
			if(isset($node0["next"], $data[$node0["next"]])){
				
				$params["page"] = (int)$data[$node0["next"]];
				
				$out["npt"] =
					$this->backend->store(
						json_encode($params),
						"web",
						$proxy
					);
			}
		}
		
		return $out;
	}
	
	/**
	 * Follow one pointer of a result entry into the node's flat data array.
	 *
	 * @param array  $data  The node's "data" array.
	 * @param array  $entry Entry whose values are indexes into $data.
	 * @param string $key   Which pointer of $entry to follow.
	 * @return mixed The referenced value, or null when the pointer or its target is missing.
	 */
	private function deref($data, $entry, $key){
		
		if(
			!isset($entry[$key]) ||
			(!is_int($entry[$key]) && !is_string($entry[$key]))
		){
			
			return null;
		}
		
		return $data[$entry[$key]] ?? null;
	}
	
	/**
	 * Strip surrounding whitespace, dots, NUL bytes and ellipsis characters.
	 *
	 * trim() works on bytes, so listing the multibyte "…" there removes its
	 * three bytes individually and can cut other UTF-8 characters in half
	 * (a leading “ or a trailing æ, for instance). The UTF-8 aware regex
	 * removes whole characters only.
	 *
	 * @param string $title Text to clean up.
	 * @return string Cleaned text.
	 */
	private function titledots($title){
		
		$title = (string)$title;
		
		$clean = preg_replace(
			'/\A[ .\t\n\r\x00\x0B\x{2026}]+|[ .\t\n\r\x00\x0B\x{2026}]+\z/u',
			"",
			$title
		);
		
		// preg_replace yields null when $title is not valid UTF-8; fall back
		// to trimming the single-byte characters only
		return $clean ?? trim($title, " .\t\n\r\0\x0B");
	}
}
|
20
scraper/mullvad_brave.php
Normal file
20
scraper/mullvad_brave.php
Normal file
@@ -0,0 +1,20 @@
|
||||
<?php
|
||||
|
||||
/**
 * Thin wrapper exposing the shared mullvad scraper bound to the "brave" engine.
 */
class mullvad_brave{
	
	/** @var mullvad Shared scraper instance every call is delegated to. */
	public $mullvad;
	
	public function __construct(){
		
		// include_once: a second wrapper around the same scraper file may have
		// been loaded already, and a plain include would re-declare the class
		include_once "scraper/mullvad.php";
		$this->mullvad = new mullvad("brave");
	}
	
	/**
	 * @param string $page Requested page type, passed through unchanged.
	 * @return array Filter definitions as returned by the shared scraper.
	 */
	public function getfilters($page){
		
		return $this->mullvad->getfilters($page);
	}
	
	/**
	 * @param array $get Request parameters, passed through unchanged.
	 * @return array Search results as returned by the shared scraper.
	 */
	public function web($get){
		
		return $this->mullvad->web($get);
	}
}
|
20
scraper/mullvad_google.php
Normal file
20
scraper/mullvad_google.php
Normal file
@@ -0,0 +1,20 @@
|
||||
<?php
|
||||
|
||||
/**
 * Thin wrapper exposing the shared mullvad scraper bound to the "google" engine.
 */
class mullvad_google{
	
	/** @var mullvad Shared scraper instance every call is delegated to. */
	public $mullvad;
	
	public function __construct(){
		
		// include_once: a second wrapper around the same scraper file may have
		// been loaded already, and a plain include would re-declare the class
		include_once "scraper/mullvad.php";
		$this->mullvad = new mullvad("google");
	}
	
	/**
	 * @param string $page Requested page type, passed through unchanged.
	 * @return array Filter definitions as returned by the shared scraper.
	 */
	public function getfilters($page){
		
		return $this->mullvad->getfilters($page);
	}
	
	/**
	 * @param array $get Request parameters, passed through unchanged.
	 * @return array Search results as returned by the shared scraper.
	 */
	public function web($get){
		
		return $this->mullvad->web($get);
	}
}
|
20
settings.php
20
settings.php
@@ -125,6 +125,10 @@ $settings = [
|
||||
"value" => "brave",
|
||||
"text" => "Brave"
|
||||
],
|
||||
[
|
||||
"value" => "mullvad_brave",
|
||||
"text" => "Mullvad (Brave)"
|
||||
],
|
||||
[
|
||||
"value" => "yandex",
|
||||
"text" => "Yandex"
|
||||
@@ -137,6 +141,10 @@ $settings = [
|
||||
"value" => "google_cse",
|
||||
"text" => "Google CSE"
|
||||
],
|
||||
[
|
||||
"value" => "mullvad_google",
|
||||
"text" => "Mullvad (Google)"
|
||||
],
|
||||
[
|
||||
"value" => "startpage",
|
||||
"text" => "Startpage"
|
||||
@@ -177,6 +185,10 @@ $settings = [
|
||||
"value" => "coccoc",
|
||||
"text" => "Cốc Cốc"
|
||||
],
|
||||
[
|
||||
"value" => "solofield",
|
||||
"text" => "Solofield"
|
||||
],
|
||||
[
|
||||
"value" => "marginalia",
|
||||
"text" => "Marginalia"
|
||||
@@ -231,6 +243,10 @@ $settings = [
|
||||
"value" => "baidu",
|
||||
"text" => "Baidu"
|
||||
],
|
||||
[
|
||||
"value" => "solofield",
|
||||
"text" => "Solofield"
|
||||
],
|
||||
[
|
||||
"value" => "pinterest",
|
||||
"text" => "Pinterest"
|
||||
@@ -308,6 +324,10 @@ $settings = [
|
||||
[
|
||||
"value" => "coccoc",
|
||||
"text" => "Cốc Cốc"
|
||||
],
|
||||
[
|
||||
"value" => "solofield",
|
||||
"text" => "Solofield"
|
||||
]
|
||||
]
|
||||
],
|
||||
|
Reference in New Issue
Block a user