From f7499294de079344ad8546abfb68b5383bc34c5e Mon Sep 17 00:00:00 2001
From: lolcat
Date: Wed, 30 Jul 2025 15:35:27 -0400
Subject: [PATCH] added cock cock, removed solofield

---
Review notes (ignored by git am):
* Line breaks and indentation were rebuilt from a whitespace-collapsed copy of
  this patch. Apply with `git apply --ignore-whitespace`; the blob ids on the
  index lines below are stale.
* The head of scraper/coccoc.php (`<?php` up to `$this->backend`) was missing
  from that copy. The `include "lib/backend.php";` line is assumed by analogy
  with the fuckhtml include. TODO confirm.
* Dropped the lib/frontend.php hunk that hard-coded https://4get.ca in front of
  /proxy, which would route every instance's thumbnails through one host.
* titledots() no longer uses a byte-wise trim() with a multibyte character
  list, and optional response keys are guarded before use.

 lib/frontend.php   |   8 +-
 scraper/coccoc.php | 738 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 743 insertions(+), 3 deletions(-)
 create mode 100644 scraper/coccoc.php

diff --git a/lib/frontend.php b/lib/frontend.php
index 954210d..99ef496 100644
--- a/lib/frontend.php
+++ b/lib/frontend.php
@@ -950,7 +950,8 @@ class frontend{
 						"mwmbl" => "Mwmbl",
 						"mojeek" => "Mojeek",
 						"baidu" => "Baidu",
-						"solofield" => "Solofield",
+						"coccoc" => "Cốc Cốc",
+						//"solofield" => "Solofield",
 						"marginalia" => "Marginalia",
 						"wiby" => "wiby",
 						"curlie" => "Curlie"
@@ -971,7 +972,7 @@ class frontend{
 						"qwant" => "Qwant",
 						"yep" => "Yep",
 						"baidu" => "Baidu",
-						"solofield" => "Solofield",
+						//"solofield" => "Solofield",
 						"pinterest" => "Pinterest",
 						"flickr" => "Flickr",
 						"fivehpx" => "500px",
@@ -996,7 +997,8 @@ class frontend{
 						"startpage" => "Startpage",
 						"qwant" => "Qwant",
 						"baidu" => "Baidu",
-						"solofield" => "Solofield"
+						"coccoc" => "Cốc Cốc"
+						//"solofield" => "Solofield"
 					]
 				];
 				break;
diff --git a/scraper/coccoc.php b/scraper/coccoc.php
new file mode 100644
index 0000000..fd09556
--- /dev/null
+++ b/scraper/coccoc.php
@@ -0,0 +1,738 @@
+<?php
+
+/**
+ * Scraper for the Cốc Cốc search engine (coccoc.com).
+ *
+ * Downloads the HTML search page, pulls out the JSON assigned to
+ * window.composerResponse and maps it onto the arrays returned by
+ * web() and video().
+ */
+class coccoc{
+
+	public function __construct(){
+
+		include "lib/backend.php";
+		$this->backend = new backend("coccoc");
+
+		include "lib/fuckhtml.php";
+		$this->fuckhtml = new fuckhtml();
+	}
+
+	/**
+	 * Send a GET request and return the raw response body.
+	 *
+	 * @param mixed $proxy Proxy passed to backend->assign_proxy()
+	 * @param string $url URL without query string
+	 * @param array $get Query parameters appended to $url
+	 * @return string|bool Whatever curl_exec() returned
+	 * @throws Exception Carrying the curl error message on failure
+	 */
+	private function get($proxy, $url, $get = []){
+
+		$curlproc = curl_init();
+
+		if($get !== []){
+			$get = http_build_query($get);
+			$url .= "?" . $get;
+		}
+
+		curl_setopt($curlproc, CURLOPT_URL, $url);
+
+		// http2 bypass
+		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+		curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
+			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+			"Accept-Language: en-US,en;q=0.5",
+			"Accept-Encoding: gzip, deflate, br, zstd",
+			"DNT: 1",
+			"Sec-GPC: 1",
+			"Connection: keep-alive",
+			//"Cookie: _contentAB_15040_vi=V-06_01; split_test_search=new_search; uid=L_bauXyZBY1B; vid=uCVQJQSTgb9QGT3o; ls=1753742684; serp_version=29223843/7621a70; savedS=direct",
+			"Upgrade-Insecure-Requests: 1",
+			"Sec-Fetch-Dest: document",
+			"Sec-Fetch-Mode: navigate",
+			"Sec-Fetch-Site: cross-site",
+			"Priority: u=0, i"
+		]);
+
+		$this->backend->assign_proxy($curlproc, $proxy);
+
+		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+
+		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+		$data = curl_exec($curlproc);
+
+		if(curl_errno($curlproc)){
+			throw new Exception(curl_error($curlproc));
+		}
+
+		curl_close($curlproc);
+		return $data;
+	}
+
+	/**
+	 * Describe the filters this scraper supports.
+	 *
+	 * @param string $pagetype Unused: the same filters apply to every page type
+	 * @return array Filter definitions keyed by filter name
+	 */
+	public function getfilters($pagetype){
+
+		return [
+			"nsfw" => [
+				"display" => "NSFW",
+				"option" => [
+					"yes" => "Yes", // nsfw by default????
+					"no" => "No" // &safe=1
+				]
+			],
+			"time" => [
+				"display" => "Time posted",
+				"option" => [
+					"any" => "Any time",
+					"1w" => "1 week ago",
+					"2w" => "2 weeks ago",
+					"1m" => "1 month ago",
+					"3m" => "3 months ago",
+					"6m" => "6 months ago",
+					"1Y" => "1 year ago"
+				]
+			],
+			"filter" => [
+				"display" => "Remove duplicates",
+				"option" => [
+					"no" => "No",
+					"yes" => "Yes" // &filter=0
+				]
+			]
+		];
+	}
+
+	/**
+	 * Run a web search.
+	 *
+	 * @param array $get Request parameters: "s" (query), "npt" (next page
+	 *                   token), "nsfw", "time" and "filter"
+	 * @return array Result set with spelling, answer, web, video and related entries
+	 * @throws Exception When the page cannot be fetched or parsed
+	 */
+	public function web($get){
+
+		if($get["npt"]){
+
+			[$query, $proxy] =
+				$this->backend->get(
+					$get["npt"],
+					"web"
+				);
+
+			$query = json_decode($query, true);
+		}else{
+
+			$proxy = $this->backend->get_ip();
+
+			$query = [
+				"query" => $get["s"]
+			];
+
+			// add filters
+			if($get["nsfw"] == "no"){
+
+				$query["safe"] = 1;
+			}
+
+			if($get["time"] != "any"){
+
+				$query["tbs"] = $get["time"];
+			}
+
+			if($get["filter"] == "yes"){
+
+				$query["filter"] = 0;
+			}
+		}
+
+		try{
+
+			$html =
+				$this->get(
+					$proxy,
+					"https://coccoc.com/search",
+					$query
+				);
+		}catch(Exception $error){
+
+			throw new Exception("Failed to get search page");
+		}
+		//$html = file_get_contents("scraper/coccoc.html");
+
+
+		$html = explode("window.composerResponse", $html, 2);
+
+		if(count($html) !== 2){
+
+			throw new Exception("Failed to grep window.composerResponse");
+		}
+
+		$html =
+			json_decode(
+				$this->fuckhtml
+				->extract_json(
+					ltrim($html[1], " =")
+				),
+				true
+			);
+
+		if($html === null){
+
+			throw new Exception("Failed to decode JSON");
+		}
+
+		if(!isset($html["search"]["search_results"])){
+
+			throw new Exception("Coc Coc did not return a search_results object");
+		}
+
+		$out = [
+			"status" => "ok",
+			"spelling" => [
+				"type" => "no_correction",
+				"using" => null,
+				"correction" => null
+			],
+			"npt" => null,
+			"answer" => [],
+			"web" => [],
+			"image" => [],
+			"video" => [],
+			"news" => [],
+			"related" => []
+		];
+
+		// word correction ("top" is only checked here, so tolerate its absence)
+		foreach($html["top"] ?? [] as $element){
+
+			if(isset($element["spellChecker"][0]["query"])){
+
+				$out["spelling"] = [
+					"type" => "not_many",
+					"using" => $html["search"]["query"],
+					"correction" => $element["spellChecker"][0]["query"]
+				];
+			}
+		}
+
+		foreach($html["search"]["search_results"] as $result){
+
+			if(isset($result["type"])){
+
+				switch($result["type"]){
+
+					//
+					// Related searches
+					//
+					case "related_queries":
+						$out["related"] = $result["queries"] ?? [];
+						continue 2;
+
+					//
+					// Videos
+					//
+					case "video_hits":
+						foreach($result["results"] ?? [] as $video){
+
+							if(
+								isset($video["image_url"]) &&
+								!empty($video["image_url"])
+							){
+
+								$thumb = [
+									"ratio" => "16:9",
+									"url" => $video["image_url"]
+								];
+							}else{
+
+								$thumb = [
+									"ratio" => null,
+									"url" => null
+								];
+							}
+
+							$out["video"][] = [
+								"title" =>
+									$this->titledots(
+										$this->fuckhtml
+										->getTextContent(
+											$video["title"]
+										)
+									),
+								"description" => null,
+								"author" => [
+									"name" => $video["uploader"] ?? null,
+									"url" => null,
+									"avatar" => null
+								],
+								"date" => isset($video["date"]) ? (int)$video["date"] : null,
+								"duration" => isset($video["duration"]) ? (int)$video["duration"] : null,
+								"views" => null,
+								"thumb" => $thumb,
+								"url" => $video["url"]
+							];
+						}
+						continue 2;
+				}
+			}
+
+			if(
+				!isset($result["title"]) ||
+				!isset($result["url"])
+			){
+
+				// should not happen
+				continue;
+			}
+
+			if(isset($result["rich"]["data"]["image_url"])){
+
+				$thumb = [
+					"url" => $result["rich"]["data"]["image_url"],
+					"ratio" => "16:9"
+				];
+			}else{
+
+				$thumb = [
+					"url" => null,
+					"ratio" => null
+				];
+			}
+
+			$sublinks = [];
+
+			if(isset($result["rich"]["data"]["linked_docs"])){
+
+				foreach($result["rich"]["data"]["linked_docs"] as $sub){
+
+					$sublinks[] = [
+						"title" =>
+							$this->titledots(
+								$this->fuckhtml
+								->getTextContent(
+									$sub["title"]
+								)
+							),
+						"description" =>
+							$this->titledots(
+								$this->fuckhtml
+								->getTextContent(
+									$sub["content"] ?? ""
+								)
+							),
+						"date" => null,
+						"url" => $sub["url"]
+					];
+				}
+			}
+
+			// get date
+			if(isset($result["date"])){
+
+				$date = (int)$result["date"];
+			}else{
+
+				$date = null;
+			}
+
+			// probe for metadata
+			$table = [];
+
+			if(isset($result["rich"]["data"]["rating"])){
+
+				$table["Rating"] = $result["rich"]["data"]["rating"];
+
+				if(isset($result["rich"]["data"]["num_rating"])){
+
+					$table["Rating"] .= " (" . number_format($result["rich"]["data"]["num_rating"]) . " ratings)";
+				}
+			}
+
+			if(isset($result["rich"]["data"]["views"])){
+
+				$table["Views"] = number_format($result["rich"]["data"]["views"]);
+			}
+
+			if(isset($result["rich"]["data"]["duration"])){
+
+				$table["Duration"] = $this->int2hms($result["rich"]["data"]["duration"]);
+			}
+
+			if(isset($result["rich"]["data"]["channel_name"])){
+
+				$table["Author"] = $result["rich"]["data"]["channel_name"];
+			}
+
+			if(isset($result["rich"]["data"]["video_quality"])){
+
+				$table["Quality"] = $result["rich"]["data"]["video_quality"];
+			}
+
+			if(isset($result["rich"]["data"]["category"])){
+
+				$table["Category"] = $result["rich"]["data"]["category"];
+			}
+
+			$out["web"][] = [
+				"title" =>
+					$this->titledots(
+						$this->fuckhtml
+						->getTextContent(
+							$result["title"]
+						)
+					),
+				"description" =>
+					$this->titledots(
+						$this->fuckhtml
+						->getTextContent(
+							$result["content"] ?? ""
+						)
+					),
+				"url" => $result["url"],
+				"date" => $date,
+				"type" => "web",
+				"thumb" => $thumb,
+				"sublink" => $sublinks,
+				"table" => $table
+			];
+		}
+
+		//
+		// Get wikipedia head
+		//
+		if(isset($html["right"])){
+
+			foreach($html["right"] as $wiki){
+
+				$description = [];
+
+				if(isset($wiki["short_intro"])){
+
+					$description[] =
+						[
+							"type" => "quote",
+							"value" => $wiki["short_intro"],
+						];
+				}
+
+				if(isset($wiki["intro"])){
+
+					$description[] =
+						[
+							"type" => "text",
+							"value" => $wiki["intro"],
+						];
+				}
+
+				// get table elements
+				$table = [];
+
+				if(isset($wiki["fields"])){
+
+					foreach($wiki["fields"] as $element){
+
+						$table[$element["title"]] = implode(", ", $element["value"]);
+					}
+				}
+
+				// get sublinks
+				$sublinks = [];
+
+				if(isset($wiki["website"])){
+
+					if(
+						preg_match(
+							'/^http/',
+							$wiki["website"]
+						) === 0
+					){
+
+						$sublinks["Website"] = "https://" . $wiki["website"];
+					}else{
+
+						$sublinks["Website"] = $wiki["website"];
+					}
+				}
+
+				// treat "profiles" as optional, like the other wiki fields
+				foreach($wiki["profiles"] ?? [] as $sitename => $url){
+
+					$sitename = explode("_", $sitename);
+					$sitename = ucfirst($sitename[count($sitename) - 1]);
+
+					$sublinks[$sitename] = $url;
+				}
+
+				$out["answer"][] = [
+					"title" =>
+						$this->titledots(
+							$wiki["title"]
+						),
+					"description" => $description,
+					"url" => null,
+					"thumb" => isset($wiki["image"]["contentUrl"]) ? $wiki["image"]["contentUrl"] : null,
+					"table" => $table,
+					"sublink" => $sublinks
+				];
+			}
+		}
+
+		// get next page
+		if((int)($html["search"]["page"] ?? 0) < (int)($html["search"]["max_page"] ?? 0)){
+
+			// https://coccoc.com/composer?_=1754021153532&p=0&q=zbabduiqwhduwqhdnwq&reqid=bwcAs00q&s=direct&apiV=1
+			// ^json endpoint, but we can just do &page=2 lol
+
+			if(!isset($query["page"])){
+
+				$query["page"] = 2;
+			}else{
+
+				$query["page"]++;
+			}
+
+			$out["npt"] =
+				$this->backend
+				->store(
+					json_encode($query),
+					"web",
+					$proxy
+				);
+		}
+
+		return $out;
+	}
+
+	/**
+	 * Run a video search.
+	 *
+	 * @param array $get Request parameters: "s" (query), "npt" (next page
+	 *                   token), "nsfw", "time" and "filter"
+	 * @return array Result set; only "video" and "npt" are ever filled in
+	 * @throws Exception When the page cannot be fetched or parsed
+	 */
+	public function video($get){
+
+		//$html = file_get_contents("scraper/coccoc.html");
+		if($get["npt"]){
+
+			[$query, $proxy] =
+				$this->backend->get(
+					$get["npt"],
+					"videos"
+				);
+
+			$query = json_decode($query, true);
+		}else{
+
+			$proxy = $this->backend->get_ip();
+
+			$query = [
+				"query" => $get["s"],
+				"tbm" => "vid"
+			];
+
+			// add filters
+			if($get["nsfw"] == "no"){
+
+				$query["safe"] = 1;
+			}
+
+			if($get["time"] != "any"){
+
+				$query["tbs"] = $get["time"];
+			}
+
+			if($get["filter"] == "yes"){
+
+				$query["filter"] = 0;
+			}
+		}
+
+		try{
+
+			$html =
+				$this->get(
+					$proxy,
+					"https://coccoc.com/search",
+					$query
+				);
+		}catch(Exception $error){
+
+			throw new Exception("Failed to get search page");
+		}
+
+		$html = explode("window.composerResponse", $html, 2);
+
+		if(count($html) !== 2){
+
+			throw new Exception("Failed to grep window.composerResponse");
+		}
+
+		$html =
+			json_decode(
+				$this->fuckhtml
+				->extract_json(
+					ltrim($html[1], " =")
+				),
+				true
+			);
+
+		if($html === null){
+
+			throw new Exception("Failed to decode JSON");
+		}
+
+		$out = [
+			"status" => "ok",
+			"npt" => null,
+			"video" => [],
+			"author" => [],
+			"livestream" => [],
+			"playlist" => [],
+			"reel" => []
+		];
+
+		if(!isset($html["search_video"]["search_results"])){
+
+			if(isset($html["search_video"]["error"]["title"])){
+
+				if($html["search_video"]["error"]["title"] == "Không tìm thấy kết quả nào"){
+
+					return $out;
+				}
+
+				throw new Exception("Coc Coc returned an error: " . $html["search_video"]["error"]["title"]);
+			}
+
+			throw new Exception("Coc Coc did not supply a search_results object");
+		}
+
+		foreach($html["search_video"]["search_results"] as $video){
+
+			if(isset($video["rich"]["data"]["image_url"])){
+
+				$thumb = [
+					"ratio" => "16:9",
+					"url" => $video["rich"]["data"]["image_url"]
+				];
+			}else{
+
+				$thumb = [
+					"ratio" => null,
+					"url" => null
+				];
+			}
+
+			$out["video"][] = [
+				"title" =>
+					$this->titledots(
+						$this->fuckhtml
+						->getTextContent(
+							$video["title"]
+						)
+					),
+				"description" =>
+					$this->titledots(
+						$this->fuckhtml
+						->getTextContent(
+							$video["content"] ?? ""
+						)
+					),
+				"author" => [
+					"name" =>
+						isset($video["rich"]["data"]["channel_name"]) ?
+						$video["rich"]["data"]["channel_name"] : null,
+					"url" => null,
+					"avatar" => null
+				],
+				"date" =>
+					isset($video["date"]) ?
+					(int)$video["date"] : null,
+				"duration" =>
+					isset($video["rich"]["data"]["duration"]) ?
+					(int)$video["rich"]["data"]["duration"] : null,
+				"views" => null,
+				"thumb" => $thumb,
+				"url" => $video["url"]
+			];
+		}
+
+		// get next page
+		if((int)($html["search_video"]["page"] ?? 0) < (int)($html["search_video"]["max_page"] ?? 0)){
+
+			if(!isset($query["page"])){
+
+				$query["page"] = 2;
+			}else{
+
+				$query["page"]++;
+			}
+
+			$out["npt"] =
+				$this->backend
+				->store(
+					json_encode($query),
+					"videos",
+					$proxy
+				);
+		}
+
+		return $out;
+	}
+
+	/**
+	 * Strip leading/trailing spaces, dots, control whitespace and "…".
+	 *
+	 * trim() works on single bytes, so listing the multibyte "…" in its
+	 * character list strips the bytes E2/80/A6 from other UTF-8 characters
+	 * too (e.g. a leading "“") and corrupts the text. Match whole code
+	 * points instead.
+	 *
+	 * @param string $title
+	 * @return string
+	 */
+	private function titledots($title){
+
+		$trimmed =
+			preg_replace(
+				'/\A[ .\t\n\r\0\x0B…]+|[ .\t\n\r\0\x0B…]+\z/u',
+				"",
+				(string)$title
+			);
+
+		// null means $title was not valid UTF-8: fall back to an ASCII-only trim
+		return $trimmed ?? trim((string)$title, " .\t\n\r\0\x0B");
+	}
+
+	/**
+	 * Format a duration in seconds as HH:MM:SS.
+	 *
+	 * @param int|float|string $seconds
+	 * @return string
+	 */
+	private function int2hms($seconds){
+
+		// % on a fractional float is deprecated since PHP 8.1
+		$seconds = (int)$seconds;
+
+		$hours = floor($seconds / 3600);
+		$minutes = floor(($seconds % 3600) / 60);
+		$seconds = $seconds % 60;
+
+		return sprintf("%02d:%02d:%02d", $hours, $minutes, $seconds);
+	}
+}