added mwmbl scraper
This commit is contained in:
parent
8944ca6894
commit
a20d4de1e4
|
@ -104,6 +104,7 @@ class config{
|
||||||
const PROXY_PINTEREST = false;
|
const PROXY_PINTEREST = false;
|
||||||
const PROXY_SEZNAM = false;
|
const PROXY_SEZNAM = false;
|
||||||
const PROXY_NAVER = false;
|
const PROXY_NAVER = false;
|
||||||
|
const PROXY_MWMBL = false;
|
||||||
const PROXY_FTM = false; // findthatmeme
|
const PROXY_FTM = false; // findthatmeme
|
||||||
const PROXY_IMGUR = false;
|
const PROXY_IMGUR = false;
|
||||||
const PROXY_YANDEX_W = false; // yandex web
|
const PROXY_YANDEX_W = false; // yandex web
|
||||||
|
|
|
@ -12,3 +12,5 @@ This guide assumes that there is already a configured webserver sitting on port
|
||||||
5. Restart the tor service using `service tor restart`
|
5. Restart the tor service using `service tor restart`
|
||||||
6. Wait for a while...
|
6. Wait for a while...
|
||||||
7. Run `cat /var/lib/tor/4get/hostname`. That is your onion address!
|
7. Run `cat /var/lib/tor/4get/hostname`. That is your onion address!
|
||||||
|
|
||||||
|
# Specify your own tor address
|
||||||
|
|
|
@ -902,6 +902,7 @@ class frontend{
|
||||||
"yandex" => "Yandex",
|
"yandex" => "Yandex",
|
||||||
"google" => "Google",
|
"google" => "Google",
|
||||||
"yep" => "Yep",
|
"yep" => "Yep",
|
||||||
|
"mwmbl" => "Mwmbl",
|
||||||
"mojeek" => "Mojeek",
|
"mojeek" => "Mojeek",
|
||||||
"marginalia" => "Marginalia",
|
"marginalia" => "Marginalia",
|
||||||
"wiby" => "wiby",
|
"wiby" => "wiby",
|
||||||
|
@ -1018,6 +1019,11 @@ class frontend{
|
||||||
$lib = new facebook();
|
$lib = new facebook();
|
||||||
break;*/
|
break;*/
|
||||||
|
|
||||||
|
case "mwmbl":
|
||||||
|
include "scraper/mwmbl.php";
|
||||||
|
$lib = new mwmbl();
|
||||||
|
break;
|
||||||
|
|
||||||
case "mojeek":
|
case "mojeek":
|
||||||
include "scraper/mojeek.php";
|
include "scraper/mojeek.php";
|
||||||
$lib = new mojeek();
|
$lib = new mojeek();
|
||||||
|
|
|
@ -538,8 +538,7 @@ class google{
|
||||||
$url .= "?" . $get;
|
$url .= "?" . $get;
|
||||||
}
|
}
|
||||||
|
|
||||||
//curl_setopt($curlproc, CURLOPT_URL, $url);
|
curl_setopt($curlproc, CURLOPT_URL, $url);
|
||||||
curl_setopt($curlproc, CURLOPT_URL, "https://ifconfig.co");
|
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
|
||||||
|
@ -560,7 +559,6 @@ class google{
|
||||||
}
|
}
|
||||||
|
|
||||||
curl_close($curlproc);
|
curl_close($curlproc);
|
||||||
echo $data;
|
|
||||||
return $data;
|
return $data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,168 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
class mwmbl{
|
||||||
|
|
||||||
|
public function __construct(){
|
||||||
|
|
||||||
|
include "lib/backend.php";
|
||||||
|
$this->backend = new backend("mwmbl");
|
||||||
|
|
||||||
|
include "lib/fuckhtml.php";
|
||||||
|
$this->fuckhtml = new fuckhtml();
|
||||||
|
}
|
||||||
|
|
||||||
|
public function getfilters($page){
|
||||||
|
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
private function get($proxy, $url, $get = []){
|
||||||
|
|
||||||
|
$curlproc = curl_init();
|
||||||
|
|
||||||
|
if($get !== []){
|
||||||
|
$get = http_build_query($get);
|
||||||
|
$url .= "?" . $get;
|
||||||
|
}
|
||||||
|
|
||||||
|
curl_setopt($curlproc, CURLOPT_URL, $url);
|
||||||
|
|
||||||
|
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
||||||
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
||||||
|
["User-Agent: " . config::USER_AGENT,
|
||||||
|
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||||
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
|
"Accept-Encoding: gzip",
|
||||||
|
"DNT: 1",
|
||||||
|
"Connection: keep-alive",
|
||||||
|
"Upgrade-Insecure-Requests: 1",
|
||||||
|
"Sec-Fetch-Dest: document",
|
||||||
|
"Sec-Fetch-Mode: navigate",
|
||||||
|
"Sec-Fetch-Site: none",
|
||||||
|
"Sec-Fetch-User: ?1"]
|
||||||
|
);
|
||||||
|
|
||||||
|
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
|
||||||
|
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
|
||||||
|
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
|
||||||
|
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
|
||||||
|
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
|
||||||
|
|
||||||
|
$this->backend->assign_proxy($curlproc, $proxy);
|
||||||
|
|
||||||
|
$data = curl_exec($curlproc);
|
||||||
|
|
||||||
|
if(curl_errno($curlproc)){
|
||||||
|
|
||||||
|
throw new Exception(curl_error($curlproc));
|
||||||
|
}
|
||||||
|
|
||||||
|
curl_close($curlproc);
|
||||||
|
return $data;
|
||||||
|
}
|
||||||
|
|
||||||
|
public function web($get){
|
||||||
|
|
||||||
|
$search = $get["s"];
|
||||||
|
if(strlen($search) === 0){
|
||||||
|
|
||||||
|
throw new Exception("Search term is empty!");
|
||||||
|
}
|
||||||
|
|
||||||
|
try{
|
||||||
|
$html = $this->get(
|
||||||
|
$this->backend->get_ip(), // no next page!
|
||||||
|
"https://mwmbl.org/app/home/",
|
||||||
|
[
|
||||||
|
"q" => $search
|
||||||
|
]
|
||||||
|
);
|
||||||
|
}catch(Exception $error){
|
||||||
|
|
||||||
|
throw new Exception("Failed to fetch HTML");
|
||||||
|
}
|
||||||
|
|
||||||
|
$out = [
|
||||||
|
"status" => "ok",
|
||||||
|
"spelling" => [
|
||||||
|
"type" => "no_correction",
|
||||||
|
"using" => null,
|
||||||
|
"correction" => null
|
||||||
|
],
|
||||||
|
"npt" => null,
|
||||||
|
"answer" => [],
|
||||||
|
"web" => [],
|
||||||
|
"image" => [],
|
||||||
|
"video" => [],
|
||||||
|
"news" => [],
|
||||||
|
"related" => []
|
||||||
|
];
|
||||||
|
|
||||||
|
$this->fuckhtml->load($html);
|
||||||
|
|
||||||
|
$results =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByClassName(
|
||||||
|
"result",
|
||||||
|
"li"
|
||||||
|
);
|
||||||
|
|
||||||
|
foreach($results as $result){
|
||||||
|
|
||||||
|
$this->fuckhtml->load($result);
|
||||||
|
|
||||||
|
$p =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByTagName("p");
|
||||||
|
|
||||||
|
$out["web"][] = [
|
||||||
|
"title" =>
|
||||||
|
$this->titledots(
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByClassName(
|
||||||
|
"title",
|
||||||
|
$p
|
||||||
|
)[0]
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"description" =>
|
||||||
|
$this->titledots(
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByClassName(
|
||||||
|
"extract",
|
||||||
|
$p
|
||||||
|
)[0]
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"url" =>
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByTagName("a")
|
||||||
|
[0]
|
||||||
|
["attributes"]
|
||||||
|
["href"]
|
||||||
|
),
|
||||||
|
"date" => null,
|
||||||
|
"type" => "web",
|
||||||
|
"thumb" => [
|
||||||
|
"url" => null,
|
||||||
|
"ratio" => null
|
||||||
|
],
|
||||||
|
"sublink" => [],
|
||||||
|
"table" => []
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
return $out;
|
||||||
|
}
|
||||||
|
|
||||||
|
private function titledots($title){
|
||||||
|
|
||||||
|
return rtrim($title, "…");
|
||||||
|
}
|
||||||
|
}
|
|
@ -125,6 +125,10 @@ $settings = [
|
||||||
"value" => "yep",
|
"value" => "yep",
|
||||||
"text" => "Yep"
|
"text" => "Yep"
|
||||||
],
|
],
|
||||||
|
[
|
||||||
|
"value" => "mwmbl",
|
||||||
|
"text" => "Mwmbl"
|
||||||
|
],
|
||||||
[
|
[
|
||||||
"value" => "mojeek",
|
"value" => "mojeek",
|
||||||
"text" => "Mojeek"
|
"text" => "Mojeek"
|
||||||
|
|
Loading…
Reference in New Issue