re-added solofield, added mullvad for brave and google
This commit is contained in:
@@ -43,7 +43,7 @@ class config{
|
|||||||
|
|
||||||
// If this regex expression matches on the user agent, it blocks the request
|
// If this regex expression matches on the user agent, it blocks the request
|
||||||
// Not useful at all against a targeted attack
|
// Not useful at all against a targeted attack
|
||||||
const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider|qwant/i';
|
const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider|qwant|meta/i';
|
||||||
|
|
||||||
// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
|
// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
|
||||||
// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
|
// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
|
||||||
@@ -118,7 +118,7 @@ class config{
|
|||||||
|
|
||||||
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
||||||
// Changing this might break things.
|
// Changing this might break things.
|
||||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:141.0) Gecko/20100101 Firefox/141.0";
|
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0";
|
||||||
|
|
||||||
// Proxy pool assignments for each scraper
|
// Proxy pool assignments for each scraper
|
||||||
// false = Use server's raw IP
|
// false = Use server's raw IP
|
||||||
@@ -130,6 +130,8 @@ class config{
|
|||||||
const PROXY_GOOGLE = false;
|
const PROXY_GOOGLE = false;
|
||||||
const PROXY_GOOGLE_API = false;
|
const PROXY_GOOGLE_API = false;
|
||||||
const PROXY_GOOGLE_CSE = false;
|
const PROXY_GOOGLE_CSE = false;
|
||||||
|
const PROXY_MULLVAD_GOOGLE = false;
|
||||||
|
const PROXY_MULLVAD_BRAVE = false;
|
||||||
const PROXY_STARTPAGE = false;
|
const PROXY_STARTPAGE = false;
|
||||||
const PROXY_QWANT = false;
|
const PROXY_QWANT = false;
|
||||||
const PROXY_BAIDU = false;
|
const PROXY_BAIDU = false;
|
||||||
@@ -143,6 +145,7 @@ class config{
|
|||||||
const PROXY_WIBY = false;
|
const PROXY_WIBY = false;
|
||||||
const PROXY_CURLIE = false;
|
const PROXY_CURLIE = false;
|
||||||
const PROXY_YT = false; // youtube
|
const PROXY_YT = false; // youtube
|
||||||
|
const PROXY_ARCHIVEORG = false;
|
||||||
const PROXY_SEPIASEARCH = false;
|
const PROXY_SEPIASEARCH = false;
|
||||||
const PROXY_ODYSEE = false;
|
const PROXY_ODYSEE = false;
|
||||||
const PROXY_VIMEO = false;
|
const PROXY_VIMEO = false;
|
||||||
|
342
scraper/mullvad.php
Normal file
342
scraper/mullvad.php
Normal file
@@ -0,0 +1,342 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Web-search scraper for Mullvad Leta (leta.mullvad.net).
 *
 * The upstream search engine is chosen by the $engine constructor argument;
 * it is sent to Leta both as the "engine" query parameter and as the
 * "engine" cookie. Results are read from the /search/__data.json endpoint,
 * whose payload is a flat "data" array in which objects hold integer
 * indexes ("pointers") into that same array instead of nested values.
 */
class mullvad{
	
	/** @var string Engine name forwarded to Leta (the wrappers in this commit pass "brave" or "google") */
	public $engine;
	
	/** @var backend Project helper used here for proxy selection and next-page token storage */
	public $backend;
	
	/**
	 * @param string $engine Engine name forwarded to Leta
	 */
	public function __construct($engine){
		
		$this->engine = $engine;
		
		// include_once: backend.php may already have been loaded by another
		// scraper in this request; a plain include would redeclare the class
		include_once "lib/backend.php";
		$this->backend = new backend("mullvad_{$this->engine}");
	}
	
	/**
	 * Describe the filters this scraper accepts.
	 *
	 * The option keys are forwarded verbatim by web() as the &country=,
	 * &language= and &lastUpdated= query parameters ("any" means the
	 * parameter is omitted).
	 *
	 * @param string $page Requested page type; not used, every page gets the same filters
	 * @return array Filter definitions keyed by filter name
	 */
	public function getfilters($page){
		return [
			"country" => [ // &country=
				"display" => "Country",
				"option" => [
					"any" => "Any country",
					"ar" => "Argentina",
					"au" => "Australia",
					"at" => "Austria",
					"be" => "Belgium",
					"br" => "Brazil",
					"ca" => "Canada",
					"cl" => "Chile",
					"cn" => "China",
					"dk" => "Denmark",
					"fi" => "Finland",
					"fr" => "France",
					"de" => "Germany",
					"hk" => "Hong Kong",
					"in" => "India",
					"id" => "Indonesia",
					"it" => "Italy",
					"jp" => "Japan",
					"kr" => "Korea, Republic",
					"my" => "Malaysia",
					"mx" => "Mexico",
					"nl" => "Netherlands",
					"nz" => "New Zealand",
					"no" => "Norway",
					"ph" => "Philippines",
					"pl" => "Poland",
					"pt" => "Portugal",
					"ru" => "Russian Federation",
					"sa" => "Saudi Arabia",
					"za" => "South Africa",
					"es" => "Spain",
					"se" => "Sweden",
					"ch" => "Switzerland",
					"tw" => "Taiwan",
					"tr" => "Turkey",
					"uk" => "United Kingdom",
					"us" => "United States"
				]
			],
			"language" => [ // &language=
				"display" => "Language",
				"option" => [
					"any" => "Any language",
					"ar" => "Arabic",
					"bg" => "Bulgarian",
					"ca" => "Catalan",
					"zh-hans" => "Chinese (Simplified)",
					"zh-hant" => "Chinese (Traditional)",
					"hr" => "Croatian",
					"cs" => "Czech",
					"da" => "Danish",
					"nl" => "Dutch",
					"en" => "English",
					"et" => "Estonian",
					"fi" => "Finnish",
					"fr" => "French",
					"de" => "German",
					"he" => "Hebrew",
					"hu" => "Hungarian",
					"is" => "Icelandic",
					"it" => "Italian",
					"jp" => "Japanese",
					"ko" => "Korean",
					"lv" => "Latvian",
					"lt" => "Lithuanian",
					"nb" => "Norwegian",
					"pl" => "Polish",
					"pt" => "Portuguese",
					"ro" => "Romanian",
					"ru" => "Russian",
					"sr" => "Serbian",
					"sk" => "Slovak",
					"sl" => "Slovenian",
					"es" => "Spanish",
					"sv" => "Swedish",
					"tr" => "Turkish"
				]
			],
			"time" => [ // &lastUpdated=
				"display" => "Time posted",
				"option" => [
					"any" => "Any time",
					"d" => "Past day",
					"w" => "Past week",
					"m" => "Past month",
					"y" => "Past year"
				]
			]
		];
	}
	
	/**
	 * Perform a GET request against Leta and return the raw response body.
	 *
	 * @param mixed $proxy Proxy descriptor handed to backend::assign_proxy()
	 * @param string $url Base URL without query string
	 * @param array $get Query parameters; appended to $url when not empty
	 * @return string Response body
	 * @throws Exception With the cURL error message when the transfer fails
	 */
	private function get($proxy, $url, $get = []){
		
		$curlproc = curl_init();
		
		if($get !== []){
			
			$url .= "?" . http_build_query($get);
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		// http2 bypass
		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: */*",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip, deflate, br, zstd",
			"Referer: https://leta.mullvad.net/search",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			// keep the cookie in sync with the "engine" query parameter
			// instead of always claiming brave
			"Cookie: engine=" . $this->engine,
			"Sec-Fetch-Dest: empty",
			"Sec-Fetch-Mode: cors",
			"Sec-Fetch-Site: same-origin",
			"Priority: u=0",
			"TE: trailers"]
		);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// read the message before releasing the handle, and release the
			// handle on the failure path too
			$message = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($message);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Run a web search, or continue one from a next-page token.
	 *
	 * @param array $get Request parameters: "npt" (next-page token, falsy for a
	 *                   fresh search), "s" (search term), "country", "language", "time"
	 * @return array Result set with "status", "spelling", "npt", "answer", "web",
	 *               "image", "video", "news" and "related" keys; only "web" and
	 *               "npt" are ever populated here
	 * @throws Exception On an empty search term, a failed request, or a payload
	 *                   that lacks the expected objects
	 */
	public function web($get){
		
		if($get["npt"]){
			
			// continuation: the stored token carries the full query and
			// the proxy that served the previous page
			[$params, $proxy] = $this->backend->get($get["npt"], "web");
			$params = json_decode($params, true);
			
		}else{
			
			if(strlen($get["s"]) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			// generate filters
			$params = [
				"q" => $get["s"],
				"engine" => $this->engine,
				"page" => 1
			];
			
			if($get["country"] != "any"){
				
				$params["country"] = $get["country"];
			}
			
			if($get["language"] != "any"){
				
				$params["language"] = $get["language"];
			}
			
			if($get["time"] != "any"){
				
				$params["lastUpdated"] = $get["time"];
			}
			
			$proxy = $this->backend->get_ip();
		}
		
		try{
			$json = $this->get(
				$proxy,
				"https://leta.mullvad.net/search/__data.json",
				$params
			);
		}catch(Exception $error){
			
			// same message as before; the cURL error stays reachable
			// through getPrevious()
			throw new Exception("Failed to fetch search page", 0, $error);
		}
		
		$json = json_decode($json, true);
		
		if($json === null){
			
			throw new Exception("Failed to decode JSON");
		}
		
		if(
			!isset($json["nodes"]) ||
			!is_array($json["nodes"])
		){
			
			throw new Exception("Mullvad did not return a nodes object");
		}
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null, // filled in below when the payload advertises a next page
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		// parse json payload
		foreach($json["nodes"] as $node){
			
			if(!isset($node["data"][0]["q"])){
				
				// not iterating through the query object
				continue;
			}
			
			// Plain copies instead of references: rebinding the foreach
			// variable by reference made every following iteration write
			// its pointer into the payload.
			$data = $node["data"];
			
			// node 0 contains pointers to what we need to iterate through
			$node0 = $data[0];
			
			if(!isset($node0["success"], $data[$node0["success"]])){
				
				throw new Exception("Mullvad did not return a success object");
			}
			
			if($data[$node0["success"]] === false){
				
				throw new Exception("Mullvad flagged the response as unsuccessful");
			}
			
			if(!isset($node0["items"], $data[$node0["items"]])){
				
				throw new Exception("Mullvad did not return an items object");
			}
			
			$search_pointers = $data[$node0["items"]];
			
			if(!is_array($search_pointers)){
				
				// nothing iterable: treat as an empty result list
				$search_pointers = [];
			}
			
			//
			// Iterate over results
			//
			foreach($search_pointers as $index){
				
				if(
					(!is_int($index) && !is_string($index)) ||
					!isset($data[$index]) ||
					!is_array($data[$index])
				){
					
					// dangling pointer, nothing to show
					continue;
				}
				
				$result = $data[$index];
				
				$link = $this->resolve($data, $result, "link");
				$title = $this->resolve($data, $result, "title");
				$description = $this->resolve($data, $result, "snippet");
				
				if(!is_string($link)){
					
					// a result without an URL is unusable
					continue;
				}
				
				$date = null;
				if(
					$this->engine == "google" &&
					is_string($description)
				){
					
					// attempt to extract date
					// Jan 12, 2017
					$date_parts = explode(" ... ", $description, 2);
					
					if(
						count($date_parts) === 2 &&
						strlen($date_parts[0]) < 15
					){
						
						$date = strtotime(trim($date_parts[0]));
						
						if($date === false){
							
							// prefix was not a date, keep the snippet whole
							$date = null;
						}else{
							
							$description = trim($date_parts[1]);
						}
					}
				}
				
				$out["web"][] = [
					"title" => $this->titledots($title),
					"description" => $this->titledots($description),
					"url" => $link,
					"date" => $date,
					"type" => "web",
					"thumb" => [
						"url" => null,
						"ratio" => null
					],
					"sublink" => [],
					"table" => []
				];
			}
			
			//
			// Get nextpage
			//
			if(isset($node0["next"], $data[$node0["next"]])){
				
				$params["page"] = (int)$data[$node0["next"]];
				
				$out["npt"] =
					$this->backend->store(
						json_encode($params),
						"web",
						$proxy
					);
			}
		}
		
		return $out;
	}
	
	/**
	 * Follow one pointer of a payload object into the flat data array.
	 *
	 * @param array $data The node's flat "data" array
	 * @param mixed $entry Object holding pointers (e.g. one search result)
	 * @param string $key Name of the pointer inside $entry
	 * @return mixed The referenced value, or null when the pointer is missing or dangling
	 */
	private function resolve($data, $entry, $key){
		
		if(
			!is_array($entry) ||
			!isset($entry[$key])
		){
			
			return null;
		}
		
		$index = $entry[$key];
		
		if(
			!is_int($index) &&
			!is_string($index)
		){
			
			return null;
		}
		
		return $data[$index] ?? null;
	}
	
	/**
	 * Strip surrounding whitespace, dots and ellipsis characters.
	 *
	 * Works on whole UTF-8 characters: a byte-wise trim() with "…" in its
	 * character list also strips the individual bytes of that character,
	 * which cuts other multi-byte characters (such as a leading “) in half.
	 *
	 * @param mixed $title Text to clean; non-scalar values (including null) yield ""
	 * @return string Cleaned text
	 */
	private function titledots($title){
		
		if(!is_scalar($title)){
			
			return "";
		}
		
		$title = (string)$title;
		
		$trimmed =
			preg_replace(
				'/^[ .\t\n\r\0\x0B\x{2026}]+|[ .\t\n\r\0\x0B\x{2026}]+$/u',
				'',
				$title
			);
		
		if($trimmed === null){
			
			// input is not valid UTF-8: fall back to an ASCII-only trim
			return trim($title, " .\t\n\r\0\x0B");
		}
		
		return $trimmed;
	}
}
|
20
scraper/mullvad_brave.php
Normal file
20
scraper/mullvad_brave.php
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Mullvad Leta scraper pinned to the "brave" engine.
 *
 * Thin wrapper: every call is delegated to a mullvad instance
 * constructed with "brave".
 */
class mullvad_brave{
	
	/** @var mullvad Underlying scraper doing the actual work */
	public $mullvad;
	
	public function __construct(){
		
		// include_once: scraper/mullvad.php is shared with the other mullvad
		// wrapper; a plain include would redeclare the class when it has
		// already been loaded
		include_once "scraper/mullvad.php";
		$this->mullvad = new mullvad("brave");
	}
	
	/**
	 * @param string $page Requested page type, passed through unchanged
	 * @return array Filter definitions as returned by mullvad::getfilters()
	 */
	public function getfilters($page){
		
		return $this->mullvad->getfilters($page);
	}
	
	/**
	 * @param array $get Request parameters, passed through unchanged
	 * @return array Result set as returned by mullvad::web()
	 */
	public function web($get){
		
		return $this->mullvad->web($get);
	}
}
|
20
scraper/mullvad_google.php
Normal file
20
scraper/mullvad_google.php
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Mullvad Leta scraper pinned to the "google" engine.
 *
 * Thin wrapper: every call is delegated to a mullvad instance
 * constructed with "google".
 */
class mullvad_google{
	
	/** @var mullvad Underlying scraper doing the actual work */
	public $mullvad;
	
	public function __construct(){
		
		// include_once: scraper/mullvad.php is shared with the other mullvad
		// wrapper; a plain include would redeclare the class when it has
		// already been loaded
		include_once "scraper/mullvad.php";
		$this->mullvad = new mullvad("google");
	}
	
	/**
	 * @param string $page Requested page type, passed through unchanged
	 * @return array Filter definitions as returned by mullvad::getfilters()
	 */
	public function getfilters($page){
		
		return $this->mullvad->getfilters($page);
	}
	
	/**
	 * @param array $get Request parameters, passed through unchanged
	 * @return array Result set as returned by mullvad::web()
	 */
	public function web($get){
		
		return $this->mullvad->web($get);
	}
}
|
20
settings.php
20
settings.php
@@ -125,6 +125,10 @@ $settings = [
|
|||||||
"value" => "brave",
|
"value" => "brave",
|
||||||
"text" => "Brave"
|
"text" => "Brave"
|
||||||
],
|
],
|
||||||
|
[
|
||||||
|
"value" => "mullvad_brave",
|
||||||
|
"text" => "Mullvad (Brave)"
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"value" => "yandex",
|
"value" => "yandex",
|
||||||
"text" => "Yandex"
|
"text" => "Yandex"
|
||||||
@@ -137,6 +141,10 @@ $settings = [
|
|||||||
"value" => "google_cse",
|
"value" => "google_cse",
|
||||||
"text" => "Google CSE"
|
"text" => "Google CSE"
|
||||||
],
|
],
|
||||||
|
[
|
||||||
|
"value" => "mullvad_google",
|
||||||
|
"text" => "Mullvad (Google)"
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"value" => "startpage",
|
"value" => "startpage",
|
||||||
"text" => "Startpage"
|
"text" => "Startpage"
|
||||||
@@ -177,6 +185,10 @@ $settings = [
|
|||||||
"value" => "coccoc",
|
"value" => "coccoc",
|
||||||
"text" => "Cốc Cốc"
|
"text" => "Cốc Cốc"
|
||||||
],
|
],
|
||||||
|
[
|
||||||
|
"value" => "solofield",
|
||||||
|
"text" => "Solofield"
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"value" => "marginalia",
|
"value" => "marginalia",
|
||||||
"text" => "Marginalia"
|
"text" => "Marginalia"
|
||||||
@@ -231,6 +243,10 @@ $settings = [
|
|||||||
"value" => "baidu",
|
"value" => "baidu",
|
||||||
"text" => "Baidu"
|
"text" => "Baidu"
|
||||||
],
|
],
|
||||||
|
[
|
||||||
|
"value" => "solofield",
|
||||||
|
"text" => "Solofield"
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"value" => "pinterest",
|
"value" => "pinterest",
|
||||||
"text" => "Pinterest"
|
"text" => "Pinterest"
|
||||||
@@ -308,6 +324,10 @@ $settings = [
|
|||||||
[
|
[
|
||||||
"value" => "coccoc",
|
"value" => "coccoc",
|
||||||
"text" => "Cốc Cốc"
|
"text" => "Cốc Cốc"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
"value" => "solofield",
|
||||||
|
"text" => "Solofield"
|
||||||
]
|
]
|
||||||
]
|
]
|
||||||
],
|
],
|
||||||
|
Reference in New Issue
Block a user