nextpage = new nextpage("fb"); include "lib/proxy_pool.php"; $this->proxy = new proxy_pool("facebook"); } public function getfilters($page){ return [ "sort" => [ "display" => "Sort by", "option" => [ "relevance" => "Relevance", "most_recent" => "Most recent" ] ], "newer" => [ "display" => "Newer than", "option" => "_DATE" ], "older" => [ "display" => "Older than", "option" => "_DATE" ], "live" => [ "display" => "Livestream", "option" => [ "no" => "No", "yes" => "Yes" ] ] ]; } private function get($url, $get = [], $reqtype = self::get){ $curlproc = curl_init(); if($get !== []){ $get = http_build_query($get); if($reqtype === self::get){ $headers = [ "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", "DNT: 1", "Connection: keep-alive", "Upgrade-Insecure-Requests: 1", "Sec-Fetch-Dest: document", "Sec-Fetch-Mode: navigate", "Sec-Fetch-Site: none", "Sec-Fetch-User: ?1" ]; $url .= "?" . $get; }else{ curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); $headers = [ "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0", "Accept: */*", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip, deflate, br", "Content-Type: application/x-www-form-urlencoded", "X-FB-Friendly-Name: SearchCometResultsPaginatedResultsQuery", //"X-FB-LSD: AVptQC4a16c", //"X-ASBD-ID: 129477", "Content-Length: " . strlen($get), "Origin: https://www.facebook.com", "DNT: 1", "Connection: keep-alive", "Referer: https://www.facebook.com/watch/", "Cookie: datr=__GMZCgwVF5BbyvAtfJojQwg; oo=v1%7C3%3A1691641171; wd=955x995", "Sec-Fetch-Dest: empty", "Sec-Fetch-Mode: cors", "Sec-Fetch-Site: same-origin", "TE: trailers" ]; curl_setopt($curlproc, CURLOPT_POST, true); curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get); } } curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); $this->proxy->assign_proxy($curlproc); $data = curl_exec($curlproc); if(curl_errno($curlproc)){ throw new Exception(curl_error($curlproc)); } curl_close($curlproc); return $data; } public function video($get){ $search = $get["s"]; $npt = $get["npt"]; $this->out = [ "status" => "ok", "npt" => null, "video" => [], "author" => [], "livestream" => [], "playlist" => [], "reel" => [] ]; if($get["npt"]){ $nextpage = json_decode( $this->nextpage->get( $npt, "videos" ), true ); // parse next page $this->video_nextpage($nextpage); return $this->out; } // generate filter data // { // "rp_creation_time:0":"{\"name\":\"creation_time\",\"args\":\"{\\\"start_year\\\":\\\"2023\\\",\\\"start_month\\\":\\\"2023-08\\\",\\\"end_year\\\":\\\"2023\\\",\\\"end_month\\\":\\\"2023-08\\\",\\\"start_day\\\":\\\"2023-08-10\\\",\\\"end_day\\\":\\\"2023-08-10\\\"}\"}", // "videos_sort_by:0":"{\"name\":\"videos_sort_by\",\"args\":\"Most Recent\"}", // "videos_live:0":"{\"name\":\"videos_live\",\"args\":\"\"}" // } $filter = []; $sort = $get["sort"]; $live = $get["live"]; $older = $get["older"]; $newer = $get["newer"]; if( $older !== false || $newer !== false ){ if($older === false){ $older = time(); } if($newer === false){ $newer = 0; } $filter["rp_creation_time:0"] = json_encode( [ "name" => "creation_time", "args" => json_encode( [ "start_year" => date("Y", $newer), "start_month" => date("Y-m", $newer), "end_year" => date("Y", $older), "end_month" => date("Y-m", $older), "start_day" => date("Y-m-d", $newer), "end_day" => date("Y-m-d", $older) ] ) ] ); } if($sort != "relevance"){ $filter["videos_sort_by:0"] = json_encode( [ "name" => "videos_sort_by", "args" => "Most Recent" ] ); } if($live != "no"){ $filter["videos_live:0"] = json_encode( [ "name" => "videos_live", "args" => "" ] ); } $req = [ "q" => $search ]; if(count($filter) !== 0){ $req["filters"] = base64_encode( json_encode( $filter ) ); } /* $html = $this->get( "https://www.facebook.com/watch/search/", $req );*/ $handle = fopen("scraper/facebook.html", "r"); $html = fread($handle, filesize("scraper/facebook.html")); fclose($handle); preg_match_all( '/({"__bbox":.*,"sequence_number":0}})\]\]/', $html, $json ); if(!isset($json[1][1])){ throw new Exception("Could not grep JSON body"); } $json = json_decode($json[1][1], true); foreach( $json ["__bbox"] ["result"] ["data"] ["serpResponse"] ["results"] ["edges"] as $result ){ $this->parse_edge($result); } // get nextpage data if( $json ["__bbox"] ["result"] ["data"] ["serpResponse"] ["results"] ["page_info"] ["has_next_page"] == 1 ){ preg_match( '/handleWithCustomApplyEach\(ScheduledApplyEach,({.*})\);}\);}\);<\/script>/', $html, $nextpagedata ); // [POST] https://www.facebook.com/api/graphql/ // FORM data, not JSON! $nextpage = [ "av" => "0", "__user" => null, "__a" => null, "__req" => "2", "__hs" => null, "dpr" => "1", "__ccg" => null, "__rev" => null, // another client side token "__s" => $this->randomstring(6) . ":" . $this->randomstring(6) . ":" . $this->randomstring(6), "__hsi" => null, // tracking fingerprint (probably generated using webgl) "__dyn" => "7xeUmwlE7ibwKBWo2vwAxu13w8CewSwMwNw9G2S0im3y4o0B-q1ew65xO2O1Vw8G1Qw5Mx61vw9m1YwBgao6C0Mo5W3S7Udo5q4U2zxe2Gew9O222SUbEaU2eU5O0GpovU19pobodEGdw46wbS1LwTwNwLw8O1pwr86C16w", "__csr" => $this->randomstring(null), "__comet_req" => null, "lsd" => null, "jazoest" => null, "__spin_r" => null, "__spin_b" => null, "__spin_t" => null, "fb_api_caller_class" => "RelayModern", "fb_api_req_friendly_name" => "SearchCometResultsPaginatedResultsQuery", "variables" => [ // this is json "UFI2CommentsProvider_commentsKey" => "SearchCometResultsInitialResultsQuery", "allow_streaming" => false, "args" => [ "callsite" => "comet:watch_search", "config" => [ "exact_match" => false, "high_confidence_config" => null, "intercept_config" => null, "sts_disambiguation" => null, "watch_config" => null ], "context" => [ "bsid" => null, "tsid" => null ], "experience" => [ "encoded_server_defined_params" => null, "fbid" => null, "type" => "WATCH_TAB_GLOBAL" ], "filters" => [], "text" => $search ], "count" => 5, "cursor" => $json ["__bbox"] ["result"] ["data"] ["serpResponse"] ["results"] ["page_info"] ["end_cursor"], "displayCommentsContextEnableComment" => false, "displayCommentsContextIsAdPreview" => false, "displayCommentsContextIsAggregatedShare" => false, "displayCommentsContextIsStorySet" => false, "displayCommentsFeedbackContext" => null, "feedLocation" => "SEARCH", "feedbackSource" => 23, "fetch_filters" => true, "focusCommentID" => null, "locale" => null, "privacySelectorRenderLocation" => "COMET_STREAM", "renderLocation" => "search_results_page", "scale" => 1, "stream_initial_count" => 0, "useDefaultActor" => false, "__relay_internal__pv__IsWorkUserrelayprovider" => false, "__relay_internal__pv__IsMergQAPollsrelayprovider" => false, "__relay_internal__pv__StoriesArmadilloReplyEnabledrelayprovider" => false, "__relay_internal__pv__StoriesRingrelayprovider" => false ], "server_timestamps" => "true", "doc_id" => "6761275837251607" // is actually dynamic ]; // append filters to nextpage foreach($filter as $key => $value){ $nextpage["variables"]["args"]["filters"][] = $value; } $nextpagedata = json_decode($nextpagedata[1], true); // get bsid foreach($nextpagedata["require"] as $key){ foreach($key as $innerkey){ if(is_array($innerkey)){ foreach($innerkey as $inner_innerkey){ if(is_array($inner_innerkey)){ foreach($inner_innerkey as $inner_inner_innerkey){ if( isset( $inner_inner_innerkey ["variables"] ["args"] ["context"] ["bsid"] ) ){ $nextpage ["variables"] ["args"] ["context"] ["bsid"] = $inner_inner_innerkey ["variables"] ["args"] ["context"] ["bsid"]; } } } } } } } foreach($nextpagedata["define"] as $key){ if(isset($key[2]["haste_session"])){ $nextpage["__hs"] = $key[2]["haste_session"]; } if(isset($key[2]["connectionClass"])){ $nextpage["__ccg"] = $key[2]["connectionClass"]; } if(isset($key[2]["__spin_r"])){ $nextpage["__spin_r"] = (string)$key[2]["__spin_r"]; } if(isset($key[2]["hsi"])){ $nextpage["__hsi"] = (string)$key[2]["hsi"]; } if( isset($key[2]["token"]) && !empty($key[2]["token"]) ){ $nextpage["lsd"] = $key[2]["token"]; } if(isset($key[2]["__spin_r"])){ $nextpage["__spin_r"] = (string)$key[2]["__spin_r"]; $nextpage["__rev"] = $nextpage["__spin_r"]; } if(isset($key[2]["__spin_b"])){ $nextpage["__spin_b"] = $key[2]["__spin_b"]; } if(isset($key[2]["__spin_t"])){ $nextpage["__spin_t"] = (string)$key[2]["__spin_t"]; } } preg_match( '/{"u":"\\\\\/ajax\\\\\/qm\\\\\/\?__a=([0-9]+)&__user=([0-9]+)&__comet_req=([0-9]+)&jazoest=([0-9]+)"/', $html, $ajaxparams ); if(count($ajaxparams) !== 5){ throw new Exception("Could not grep the AJAX parameters"); } $nextpage["__a"] = $ajaxparams[1]; $nextpage["__user"] = $ajaxparams[2]; $nextpage["__comet_req"] = $ajaxparams[3]; $nextpage["jazoest"] = $ajaxparams[4]; /* $handle = fopen("scraper/facebook-nextpage.json", "r"); $json = fread($handle, filesize("scraper/facebook-nextpage.json")); fclose($handle);*/ $nextpage["variables"] = json_encode($nextpage["variables"]); $this->video_nextpage($nextpage); } return $this->out; } private function video_nextpage($nextpage, $getcursor = false){ $json = $this->get( "https://www.facebook.com/api/graphql/", $nextpage, self::post ); $json = json_decode($json, true); if($json === null){ throw new Exception("Failed to decode next page JSON"); } foreach( $json ["data"] ["serpResponse"] ["results"] ["edges"] as $result ){ $this->parse_edge($result); } if( $json ["data"] ["serpResponse"] ["results"] ["page_info"] ["has_next_page"] == 1 ){ $nextpage["variables"] = json_decode($nextpage["variables"], true); $nextpage["variables"]["cursor"] = $json ["data"] ["serpResponse"] ["results"] ["page_info"] ["end_cursor"]; $nextpage["variables"] = json_encode($nextpage["variables"]); //change this for second call. after, it's static. // TODO: csr also updates to longer string $nextpage["__dyn"] = "7xeUmwlEnwn8K2WnFw9-2i5U4e0yoW3q322aew9G2S0zU20xi3y4o0B-q1ew65xOfxO1Vw8G11xmfz81s8hwGwQw9m1YwBgao6C2O0B85W3S7Udo5qfK0EUjwGzE2swwwJK2W2K0zK5o4q0GpovU19pobodEGdw46wbS1LwTwNwLw8O1pwr86C16w"; // TODO: change this on third and 6th call //$nextpage["__s"] = $this->randomstring(6) . ":" . explode(":", $nextpage["__s"], 2)[1]; $this->out["npt"] = $this->nextpage->store(json_encode($nextpage), "videos"); } } private function parse_edge($edge){ $append = "video"; $edge = $edge ["relay_rendering_strategy"] ["view_model"]; if( strtolower( $edge ["video_metadata_model"] ["video_broadcast_status"] ) == "live" ){ // handle livestream $duration = "_LIVE"; $append = "livestream"; $timetext = null; $views = (int)$edge ["video_metadata_model"] ["relative_time_string"]; $url_prefix = "https://www.facebook.com/watch/live/?v="; }elseif( stripos( $edge ["video_metadata_model"] ["video_broadcast_status"], "vod" ) !== false ){ // handle VOD format $timetext = null; $views = (int)$edge ["video_metadata_model"] ["relative_time_string"]; $duration = $this->hms2int( $edge ["video_thumbnail_model"] ["video_duration_text"] ); $url_prefix = "https://www.facebook.com/watch/live/?v="; }else{ // handle normal format $timetext = explode( " ยท ", $edge ["video_metadata_model"] ["relative_time_string"], 2 ); if(count($timetext) === 2){ $views = $this->truncatedcount2int($timetext[1]); }else{ $views = null; } $timetext = strtotime($timetext[0]); $duration = $this->hms2int( $edge ["video_thumbnail_model"] ["video_duration_text"] ); $url_prefix = "https://www.facebook.com/watch/?v="; } if( isset( $edge ["video_metadata_model"] ["video_owner_profile"] ["uri_token"] ) ){ $profileurl = "https://www.facebook.com/watch/" . $edge ["video_metadata_model"] ["video_owner_profile"] ["uri_token"]; }else{ $profileurl = $edge ["video_metadata_model"] ["video_owner_profile"] ["url"]; } $this->out[$append][] = [ "title" => $this->limitstrlen( str_replace( "\n", " ", $edge ["video_metadata_model"] ["title"] ), 100 ), "description" => empty( $edge ["video_metadata_model"] ["save_description"] ) ? null : str_replace( "\n", " ", $this->limitstrlen( $edge ["video_metadata_model"] ["save_description"] ) ), "author" => [ "name" => $edge ["video_metadata_model"] ["video_owner_profile"] ["name"], "url" => $profileurl, "avatar" => null ], "date" => $timetext, "duration" => $duration, "views" => $views, "thumb" => [ "url" => $edge ["video_thumbnail_model"] ["thumbnail_image"] ["uri"], "ratio" => "16:9" ], "url" => $url_prefix . $edge ["video_click_model"] ["click_metadata_model"] ["video_id"] ]; } private function randomstring($len){ if($len === null){ $str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789-"; $len = rand(141, 145); $c = 61; }else{ $str = "abcdefghijklmnopqrstuvwxyz123456789"; $c = 34; } $out = null; for($i=0; $i<$len; $i++){ $out .= $str[rand(0, $c)]; } return $out; } private function limitstrlen($text, $len = 300){ return explode("\n", wordwrap($text, $len, "\n"))[0]; } private function hms2int($time){ $parts = explode(":", $time, 3); $time = 0; if(count($parts) === 3){ // hours $time = $time + ((int)$parts[0] * 3600); array_shift($parts); } if(count($parts) === 2){ // minutes $time = $time + ((int)$parts[0] * 60); array_shift($parts); } // seconds $time = $time + (int)$parts[0]; return $time; } private function truncatedcount2int($number){ // decimal should always be 1 number long $number = explode(" ", $number, 2); $number = $number[0]; $unit = strtolower($number[strlen($number) - 1]); $tmp = explode(".", $number, 2); $number = (int)$number; if(count($tmp) === 2){ $decimal = (int)$tmp[1]; }else{ $decimal = 0; } switch($unit){ case "k": $exponant = 1000; break; case "m": $exponant = 1000000; break; case "b"; $exponant = 1000000000; break; default: $exponant = 1; break; } return ($number * $exponant) + ($decimal * ($exponant / 10)); } }