session_end();
    die($this->__log($err));
}

/**
 * Loads every row of `sites`, selects the requested site and initialises
 * the date dictionaries, deal/category maps, the proxy and the cookie path.
 *
 * $this->error() is called when the site id is unknown or the site is
 * flagged inactive.
 *
 * Array keys that used to be bare words (prodam, kvartiry, ip, ROOT, ...)
 * are quoted: the bare form only worked through PHP's undefined-constant
 * fallback and raised a notice on every use.
 * NOTE(review): assumes no constant with any of those names is defined.
 *
 * @param int|string $site_id primary key in the `sites` table
 */
public function __construct($site_id)
{
    $this->sites = [];
    $dbc = mysql_query("SELECT * FROM sites ORDER BY id");
    while ($rt = mysql_fetch_object($dbc)) {
        $this->sites[$rt->id] = $rt;
    }

    $site_id = (int) $site_id;
    $this->site = &$this->sites[$site_id];
    if (!$this->site) {
        $this->error("site (id = {$site_id}) not found");
    }
    if (!$this->site->is_active) {
        $this->error("site «" . $this->site->name . "» (id = " . $this->site->id . ") disabled");
    }
    $this->site_id = $this->site->id;

    $this->cities = conf::$cities;
    $this->timeout = conf::$timeout;
    $this->mirrors = conf::$mirrors;

    // Relative day names ("yesterday", "today") as they appear in listings.
    $this->days = [
        "вчера" => date("Y-m-d", time() - 86400),
        "сегодня" => date("Y-m-d"),
    ];
    // Items dated at or before this moment are rejected by checkItem().
    $this->datelimit = date("Y-m-d", time() - 2 * 86400) . " 00:00:00";
    // Month number => Russian month name (genitive case).
    $this->month = [
        "01" => "января",
        "02" => "февраля",
        "03" => "марта",
        "04" => "апреля",
        "05" => "мая",
        "06" => "июня",
        "07" => "июля",
        "08" => "августа",
        "09" => "сентября",
        "10" => "октября",
        "11" => "ноября",
        "12" => "декабря",
    ];
    $this->year = date("Y");
    // Relative area tolerance used by get_link_s().
    $this->dS = 0.1;

    $this->deals = [
        'prodam' => 1,
        'sdam' => 2,
    ];
    $this->categories = [
        'kvartiry' => 1,                    # Apartments
        'komnaty' => 2,                     # Rooms
        'doma_dachi_kottedzhi' => 3,        # Houses, dachas, cottages
        'zemelnye_uchastki' => 4,           # Land plots
        'kommercheskaya_nedvizhimost' => 5, # Commercial real estate
        'garazhi_i_mashinomesta' => 6,      # Garages and parking spaces
        'nedvizhimost_za_rubezhom' => 7,    # Real estate abroad
    ];

    $this->__proxy = (object) ['ip' => "", 'port' => ""];
    $this->proxy = "10.0.0.1:8800";
    list($this->__proxy->ip, $this->__proxy->port) = explode(":", $this->proxy);
    $this->cookies = "{$_ENV['ROOT']}/../logs/{$this->site->code}.cookie";
}

/**
 * Records the outcome of the last request made through the current proxy
 * in `site_proxy`.
 *
 * @param mixed  $n      truthy = failure (fails_counter is incremented);
 *                       falsy = success (fails_counter reset, success stats updated)
 * @param string $status value stored in last_status
 * @param string $error  value stored in last_error
 * @return false|null false when no proxy object is set, otherwise nothing
 */
public function update_proxy($n, $status = "", $error = "")
{
    if (!$this->__proxy) {
        return false;
    }
    $status = mysql_real_escape_string($status);
    $error = mysql_real_escape_string($error);
    $sql = "UPDATE site_proxy SET date_check = NOW(), last_status = '{$status}', last_error = '{$error}', fails_counter = "
        . ($n ? "fails_counter + 1" : "0, success_counter = success_counter + 1, last_success = NOW()")
        . " WHERE site_id = '{$this->site->id}' AND proxy_id = '{$this->__proxy->id}'";
    mysql_query($sql);
}

/**
 * Selects the proxy used for subsequent requests.
 *
 * Currently always returns the hard-coded proxy "10.0.0.1:8800"; both
 * arguments are ignored because of the early return.
 *
 * @param string $proxy "ip:port" of a specific proxy (unused, see note)
 * @param int    $reset retry flag for the DB-driven selection (unused, see note)
 * @return string "ip:port"
 */
public function get_proxy($proxy = "", $reset = 0)
{
    $this->__proxy = (object) ['ip' => "", 'port' => ""];
    $this->proxy = "10.0.0.1:8800";
    list($this->__proxy->ip, $this->__proxy->port) = explode(":", $this->proxy);
    return $this->proxy;

    // NOTE(review): everything below is unreachable because of the return
    // above. It is the DB-driven proxy selection, kept so it can be
    // re-enabled by removing the hard-coded override.
    if ($proxy) {
        $this->__proxy = mysql_fetch_object(mysql_query("SELECT * FROM proxy WHERE proxy = '" . mysql_real_escape_string($proxy) . "'")) or $this->error("proxy {$proxy} not found!");
    } else {
        if ($reset) {
            mysql_query("UPDATE site_proxy SET fails_counter = 0, last_status = '' WHERE site_id = '{$this->site->id}' AND is_active");
        }
        $this->__proxy = mysql_fetch_object(mysql_query("SELECT p.* FROM proxy p INNER JOIN site_proxy s ON s.site_id = '{$this->site->id}' AND p.id = s.proxy_id WHERE s.is_active AND (s.last_status != '403' OR s.last_status = '403' AND s.date_check + INTERVAL 15 MINUTE < NOW()) ORDER BY RAND() LIMIT 1"));
        if (!$this->__proxy) {
            return $reset ? $this->error("no active proxy in proxy-list!") : $this->get_proxy("", 1);
        }
    }
    $this->proxy = $this->__proxy->proxy;
    list($this->__proxy->ip, $this->__proxy->port) = explode(":", $this->proxy);
    return $this->proxy;
}

/**
 * Converts a listing date ("вчера", "сегодня", or day + month name +
 * optional year) and an optional "HH:MM" time into "Y-m-d HH:MM:00".
 *
 * @param string $date date text as scraped from the site
 * @param string $time "HH:MM"; defaults to "00:00" when empty
 * @return string
 */
public function get_date($date, $time = "")
{
    if (!$time) $time = "00:00";
    $time .= ":00";
    $date = mb_strtolower(trim($date), "utf-8");
    if ($this->days[$date]) $date = $this->days[$date];
    else {
        // $date becomes the match array: [1] = day, [2] = month text, [3] = year
        preg_match("/^(\d*)(\D*)(\d*)$/i", $date, $date);
        if (!$date[3]) $date[3] = $this->year;
        elseif (mb_strlen($date[3], "utf-8") < 4) $date[3] = "20" . $date[3];
        if (!isset($this->monthcache[$date[2]])) {
            // Resolve the (possibly abbreviated) month text to its number once and cache it.
            $this->monthcache[$date[2]] = preg_replace("/\p{P}+$/", "", trim($date[2]));
            foreach ($this->month as $k => $v) {
                if (preg_match("/^" . preg_quote($this->monthcache[$date[2]], "/") . "/ui", $v)) {
                    $this->monthcache[$date[2]] = $k;
                    break;
                }
            }
        }
        if (mb_strlen($date[1], "utf-8") < 2) $date[1] = "0" .
$date[1];
        $date[2] = $this->monthcache[$date[2]];
        // A month later than the current one cannot belong to this year: use the previous year.
        if ($date[2] > date("m") && $date[3] >= $this->year) $date[3] = $this->year - 1;
        $date = "{$date[3]}-{$date[2]}-{$date[1]}";
    }
    return $date . " " . $time;
}

/**
 * Relays a request through the office HTTP proxy script: method, URL and
 * serialized options are POSTed and the raw response body is returned.
 *
 * @param string $url    target URL
 * @param mixed  $opts   options, sent serialized in the OPTS field
 * @param string $method value sent in the METHOD field
 * @return string|false result of file_get_contents()
 */
private function __get_url($url, $opts, $method)
{
    sleep(4);
    $log = "get_url: {$method} : {$url}";
    $this->__log($log);

    $params = [
        'METHOD' => $method,
        'URL' => $url,
        'OPTS' => serialize($opts),
    ];
    $content = http_build_query($params);
    $context = stream_context_create([
        'http' => [ // was a bare word; quoted to avoid the undefined-constant fallback
            "protocol_version" => 1.1,
            "method" => "POST",
            "timeout" => 60,
            "header" => [
                "Content-Type: application/x-www-form-urlencoded",
                "Content-Length: " . strlen($content),
                "Connection: close",
            ],
            "content" => $content,
        ]]);
    $data = file_get_contents("https://office.oris-info.ru/proxy/index.php", false, $context);
    return $data;
}

/**
 * Fetches a URL with cURL (GET), optionally through $this->proxy.
 *
 * On a cURL error or a non-200 status the failure is logged; a proxied
 * request is then retried once without the proxy, a direct one returns "".
 *
 * @param string       $url   absolute http(s) URL
 * @param bool         $proxy whether to route the request through $this->proxy
 * @param array|string $opts  extra request header line(s)
 * @param int          $n     unused
 * @return string response body, or "" on failure
 */
public function get_url($url, $proxy = true, $opts = [], $n = 1)
{
    //$ss = $this->__get_url($url, $opts, "GET");
    //return $ss ? $ss : $this->error("get_url: oris-proxy-error");
    if (!$url || !preg_match("/^https?[:]\/\//", $url)) {
        return $this->error("get_url: incorrect url = {$url}");
    }
    $log = "get_url: url = {$url}";

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 2);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    // curl_setopt($ch, CURLOPT_SSLVERSION, 2);
    if ($this->cookies) {
        curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookies);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookies);
    }

    $headers = is_array($opts) ? $opts : [$opts];
    $headers[] = "Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3";
    $headers[] = "Cache-Control: max-age=0";
    $headers[] = "Connection: keep-alive";
    $headers[] = "User-Agent: Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0";

    if ($proxy) {
        curl_setopt($ch, CURLOPT_PROXY, $this->proxy);
        $log .= ", proxy = {$this->proxy}";
    }
    $this->__log($log);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

    sleep(4); // throttle between requests
    $ss = curl_exec($ch);
    $er = curl_error($ch);
    $ci = curl_getinfo($ch);
    curl_close($ch);

    $f = $er || $ci["http_code"] != 200;
    if ($f) {
        $this->__log("{$log} fails (err = {$er}, http_code = {$ci["http_code"]})");
        return $proxy ? $this->get_url($url, false, $opts) : "";
    }
    return $ss;
}

/**
 * Sends a form-encoded POST with cURL, optionally through $this->proxy.
 *
 * Fix: the $proxy argument used to be ignored and the proxy was always
 * applied; it is now honoured in the same way as in get_url(). The default
 * (true) keeps the previous behaviour for callers that do not pass it.
 *
 * @param string       $url   absolute http(s) URL
 * @param bool         $proxy whether to route the request through $this->proxy
 * @param array|string $opts  extra request header line(s)
 * @param array        $form  form fields
 * @return string response body, or "" on a cURL error / non-200 status
 */
public function post_url($url, $proxy = true, $opts = [], $form = [])
{
    //$ss = $this->__get_url($url, $opts, "POST");
    //return $ss ? $ss : $this->error("post_url: oris-proxy-error");
    if (!$url || !preg_match("/^https?[:]\/\//", $url)) {
        return $this->error("post_url: incorrect url = {$url}");
    }
    $query = http_build_query($form);
    $log = "post_url: url = {$url}&{$query}";

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $query);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    if ($this->cookies) {
        curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookies);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookies);
    }

    $headers = is_array($opts) ? $opts : [$opts];
    $headers[] = "Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3";
    $headers[] = "Cache-Control: max-age=0";
    $headers[] = "Connection: keep-alive";
    $headers[] = "User-Agent: Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0";
    $headers[] = "Content-length: " . strlen($query);

    if ($proxy) {
        curl_setopt($ch, CURLOPT_PROXY, $this->proxy);
        $log .= ", proxy = {$this->proxy}";
    }
    $this->__log($log);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

    sleep(4); // throttle between requests
    $ss = curl_exec($ch);
    $er = curl_error($ch);
    $ci = curl_getinfo($ch);
    curl_close($ch);

    $f = $er || $ci["http_code"] != 200;
    if ($f) {
        $this->__log("{$log} fails (err = {$er}, http_code = {$ci["http_code"]})");
        return "";
    }
    return $ss;
}

/** Hook called by grab() before scraping starts; intentionally empty here. */
public function session_start()
{
}

/**
 * Removes the pid file of the current grabber run. The file name is built
 * from the site code and CLI arguments 2..4; unlink errors are suppressed.
 */
public function session_end()
{
    @unlink("{$_ENV['ROOT']}/../pids/{$this->site->code}-{$_SERVER['argv'][2]}-{$_SERVER['argv'][3]}-{$_SERVER['argv'][4]}.pid");
}

/** Page count for a section URL; grab() iterates pages up to this value. */
abstract public function getPages($url, $n = 0);

/** Items found on one listing page; grab() expects objects exposing ->id. */
abstract public function getItems($url, $n = 0);

/** Processes a single item; grab() records the URL via saveUrl() on a falsy result. */
abstract public function getItem($item, $n = 0);

/** Builds the listing URL for a section suffix and page number. */
abstract public function create_url($urlsfx, $p = 1);

/**
 * Decides whether a scraped item should be processed.
 *
 * Fix: the scraped item id is escaped before being put into the query.
 *
 * @param object $item item with ->id and ->date
 * @return int 1 to process the item, 0 to skip it
 */
final public function checkItem($item)
{
    if ($item->date && $item->date <= $this->datelimit) return 0;
    $olditem = mysql_fetch_object(mysql_query("SELECT * FROM objects WHERE site_id = '{$this->site->id}' AND id = '" . mysql_real_escape_string($item->id) . "'"));
    if (!$this->check_olditems) return $olditem ?
0 : 1;
    // Not stored yet: always process.
    if( !$olditem ) return 1;
    // Re-processing a stored item is limited to displayed, non-agency
    // apartments/rooms of the current category and deal type whose date
    // has moved forward.
    if( !$olditem->display || $olditem->is_agency == 1 || $item->date <= $olditem->date || $olditem->category != $this->category || $olditem->deal_type != $this->deal_type || $olditem->category != "kvartiry" && $olditem->category != "komnaty" ) return 0;
    $d1 = date_create($item->date);
    $d2 = date_create($olditem->date);
    $r = date_diff($d1, $d2);
    // Less than one full day between the two dates: nothing to update.
    if(!$r->days) return 0;
    $olditem->date_diff = $r->days;
    $olditem->price = __floatval(preg_replace('/[^\d.,]+/', '', $olditem->price));
    // Remember the stored row for updateItem() / link_object().
    $this->olditems[$item->id] = $olditem;
    return 1;
}

/**
 * Inserts a scraped item into `objects`; for displayed items it also
 * stores the phones, resolves street and house number, links the object
 * to its group and writes the resulting link/street columns back.
 *
 * @param object $item scraped item; its public properties become columns
 */
public function saveItem($item)
{
    $item->site_id = $this->site->id;
    $item->date_add = date("Y-m-d H:i:s");
    // Fall back to the values of the current grab() run.
    if(!$item->city) $item->city = $this->city;
    if(!$item->deal_type) $item->deal_type = $this->deal_type;
    if(!$item->category) $item->category = $this->category;
    $data = (array) $item;
    // These properties are not written to `objects`.
    unset($data["city"]);
    unset($data["date_begin"]);
    unset($data["date_end"]);
    unset($data["proxy"]);
    unset($data["session_id"]);
    unset($data["raw_params"]);
    foreach ($data as &$_) $_ = mysql_real_escape_string($_);
    unset($_);
    # TODO: ON DUPLICATE KEY UPDATE ...
    $sql = "INSERT INTO objects (`" . implode("`, `", array_keys($data)) . "`) VALUES('" . implode("', '", $data) . "')";
    mysql_query($sql) or $this->error(mysql_error());
    if($item->display){
        $item->object_id = mysql_insert_id();
        if($item->phone > ""){
            // One phone per line; a leading 7 is rewritten to 8.
            $tmp = explode("\n",preg_replace("/^7/sm","8",$item->phone));
            $sql = "INSERT INTO objects_phones (object_id,phone) VALUES ";
            foreach($tmp as $_){
                $_ = mysql_real_escape_string(trim($_));
                $sql .= "('{$item->object_id}','{$_}'), ";
            }
            // Drop the trailing ", " left by the loop.
            $sql = preg_replace("/,\s+$/","",$sql);
            $sql .= " ON DUPLICATE KEY UPDATE object_id = object_id";
            mysql_query($sql) or $this->__log("saveItem(): " . mysql_error());
        }
        $this->get_street($item);
        $this->get_house_number($item);
        $this->link_object($item);
        mysql_query("UPDATE objects SET link_id = '{$item->link_id}', link_s = '{$item->link_s}', street_id = '{$item->street_id}', streets = '{$item->streets}', house_number = '{$item->house_number}' WHERE object_id = '{$item->object_id}'");
        mysql_query("UPDATE objects_phones SET link_id = '{$item->link_id}' WHERE object_id = '{$item->object_id}'");
    }
}

/**
 * Updates an already stored object (the row remembered by checkItem() in
 * $this->olditems) with freshly scraped data, logs which fields changed
 * and, for displayed items, rebuilds phones, street, house number and the
 * link group, then appends the change log to `objects_notes`.
 *
 * @param object $item scraped item; must have been accepted by checkItem()
 */
public function updateItem($item)
{
    $olditem = $this->olditems[$item->id];
    $item->site_id = $this->site->id;
    $item->s = number_format($item->s, 2, '.', '');
    $date_add = date("Y-m-d H:i:s");
    if(!$item->city) $item->city = $this->city;
    if(!$item->deal_type) $item->deal_type = $this->deal_type;
    if(!$item->category) $item->category = $this->category;
    // These properties are not written to `objects`.
    unset($item->city);
    unset($item->date_begin);
    unset($item->date_end);
    unset($item->proxy);
    unset($item->session_id);
    unset($item->raw_params);
    // Fields whose change is tracked: 0 = unchanged, non-zero = changed.
    // The map is passed to link_object() below.
    $cmpfields = ['date' => 0, 'price' => 0, 'phone' => 0, 'level' => 0, 'levels' => 0, 'rooms' => 0, 's' => 0, 'address' => 0];
    $log = "UPDATED: [{$olditem->object_id}] : ";
    $sql = "UPDATE objects SET date_add = '{$date_add}'";
    foreach((array) $item as $k => $v) {
        if(isset($cmpfields[$k])) {
            if($k == 'phone') {
                // Phones count as changed only when the new list has numbers the old one lacks.
                $t1 = explode("\n",preg_replace("/^7/sm","8", $item->phone));
                $t2 = explode("\n",preg_replace("/^7/sm","8", $olditem->phone));
                $d = array_diff($t1, $t2);
                $cmpfields[$k] = count($d);
            }
            else if($item->$k != $olditem->$k) {
                $log .= "{$k}: {$olditem->$k} --> {$item->$k}; ";
                $cmpfields[$k] = 1;
            }
        }
        else if($item->$k != $olditem->$k) {
            // Untracked field: only its name goes into the log.
            $log .= "{$k}; ";
        }
        // Every property of the item is written, changed or not.
        $sql .= ", `{$k}` = '" . mysql_real_escape_string($v) . "'";
    }
    $sql .= " WHERE object_id = '{$olditem->object_id}'";
    mysql_query($sql) or $this->error(mysql_error());
    $this->__log($log);
    if($item->display) {
        $item->date_add = $date_add;
        $item->object_id = $olditem->object_id;
        if($item->phone > ""){
            $tmp = explode("\n",preg_replace("/^7/sm","8",$item->phone));
            // Replace the stored phone list with the new one.
            mysql_query("DELETE FROM objects_phones WHERE object_id = '{$item->object_id}'");
            $sql = "INSERT INTO objects_phones (object_id,phone) VALUES ";
            foreach($tmp as $_){
                $_ = mysql_real_escape_string(trim($_));
                $sql .= "('{$item->object_id}','{$_}'), ";
            }
            $sql = preg_replace("/,\s+$/","",$sql);
            $sql .= " ON DUPLICATE KEY UPDATE object_id = object_id";
            mysql_query($sql) or $this->__log("updateItem(): " . mysql_error());
        }
        $this->get_street($item);
        $this->get_house_number($item);
        $this->link_object($item, $cmpfields);
        mysql_query("UPDATE objects SET link_id = '{$item->link_id}', link_s = '{$item->link_s}', street_id = '{$item->street_id}', streets = '{$item->streets}', house_number = '{$item->house_number}' WHERE object_id = '{$item->object_id}'");
        mysql_query("UPDATE objects_phones SET link_id = '{$item->link_id}' WHERE object_id = '{$item->object_id}'");
        // NOTE(review): these two statements die() on error, whereas the rest of the method uses $this->error() / $this->__log().
        mysql_query("UPDATE objects_links SET checked = '{$item->checked}' WHERE (link_id, link_s) = ('{$item->link_id}', '{$item->link_s}')") or die(mysql_error());
        mysql_query("UPDATE objects SET checked = '{$item->checked}' WHERE (link_id, link_s) = ('{$item->link_id}', '{$item->link_s}')") or die(mysql_error());
        // Append the change log to the note of the link group.
        mysql_query("INSERT INTO objects_notes (link_id, link_s, note) VALUES('{$item->link_id}','{$item->link_s}','" . mysql_real_escape_string($log) . "') ON DUPLICATE KEY UPDATE note = TRIM(CONCAT(note, '" . mysql_real_escape_string("\n" . $log) . "'))");
    }
}

/**
 * Checks phone numbers against the agency list in oris.oris_agencies.
 *
 * @param string|array $phones one phone or a list of phones
 * @return int 0 when the list is empty or any phone matches a non-deleted
 *             agency, 1 otherwise
 */
public function checkPhone($phones)
{
    $arr = [];
    if (!is_array($phones)) $phones = [$phones];
    foreach ($phones as $_) {
        $_ = trim($_);
        if ($_) $arr[] = $_;
    }
    if (!count($arr)) return 0;
    // City code without its first character.
    $n = mb_strlen(conf::$city->code, "utf-8") - 1;
    $code = mb_substr(conf::$city->code, 1, $n, "utf-8");
    foreach ($arr as $phone){
        // Disabled (and unfinished) lookup against an external phone-check service.
        /* $rt = mysql_fetch_object(mysql_query("SELECT * FROM phone_cache WHERE phone = '" . mysql_real_escape_string($phone) . "'")); if ($rt && $rt->is_agency) return 0; else if (!$rt) { $is_agency = 0; $url = "http://rent-scaner.ru/check-phone"; $data = $then->get_url($url); if ($data) { if (preg_match('@*\bname="csrf-token"])(?=[^>*\bcontent="(.*)"])[^>]*>@isU', $data, $tmp)) { $data = $this->post_url($url, false, [], ["_csrf" => $tmp[1], "CheckPhone[phone]" => $phone]); if ($data && preg_match()) } } mysql_query("INSERT INTO phone_cache (phone, is_agency) VALUES('" . mysql_real_escape_string($phone) . "', '{$is_agency}')"); if ($is_agency) return 0; } */
        // $tmp: the phone with a leading 7/8 and the city code removed.
        $tmp = mysql_real_escape_string(preg_replace("/^[78]{$code}/","",$phone));
        // $phone: first digit replaced by the LIKE wildcard "_", so any leading digit matches.
        $phone = mysql_real_escape_string(preg_replace("/^\d/","_",$phone));
        $n = mysql_result(mysql_query("SELECT COUNT(*) FROM oris.oris_agencies WHERE phone > '' AND !deleted AND (phone = '{$tmp}' OR phone LIKE '{$phone}')"),0);
        if($n) return 0;
    }
    return 1;
}

/**
 * Runs one scraping pass for a city / deal type / category: collects the
 * items accepted by checkItem() from every section page, then calls
 * getItem() on each of them.
 *
 * @param string $city      key of $this->cities
 * @param string $deal_type key of $this->deals
 * @param string $category  key of $this->categories
 * @return bool false when the combination is not configured, true otherwise
 */
public function grab($city, $deal_type, $category)
{
    if( !isset($this->cities[$city]) || !isset($this->deals[$deal_type]) || !isset($this->categories[$category]) || !isset($this->sections[$deal_type][$category]) ) {
        return false;
    }
    $this->city = $city;
    $this->deal_type = $deal_type;
    $this->category = $category;
    $this->session_start();
    $this->items = [];
    // A section may be a single URL suffix or a list of them.
    $sections = is_array($this->sections[$deal_type][$category]) ?
$this->sections[$deal_type][$category] : [$this->sections[$deal_type][$category]];

    foreach ($sections as $urlsfx) {
        $url = $this->create_url($urlsfx);
        $pages = $this->getPages($url);
        // Number of consecutive rejected items tolerated before giving up.
        $limit = $this->limit;
        for ($p = $this->startpage; $p <= $pages; $p++) {
            $url = $this->create_url($urlsfx, $p);
            $items = $this->getItems($url);
            foreach ($items as $item) {
                if (!$this->checkItem($item)) {
                    $limit--;
                } else {
                    $limit = $this->limit;
                    $this->items[$item->id] = $item;
                }
                // NOTE(review): "break 2" leaves the page loop only; the next section is still processed.
                if (!$limit) break 2;
            }
        }
    }

    $n = count($this->items);
    $this->__log("\n\n=========\n ITEMS FOUND: {$n}\n=========\n\n");

    // Process in reverse order of collection.
    $this->items = array_reverse($this->items);
    foreach ($this->items as $item) {
        if (!$this->getItem($item)) {
            $this->saveUrl($item);
        }
    }
    $this->session_end();
    return true;
}

/**
 * Starts one background grabber process per city / deal type / category,
 * killing a previous run recorded in the pid file first, and waits four
 * minutes between launches.
 *
 * Fixes: the pid read from the pid file is cast to int and only used when
 * positive, so stray file content is never passed to the shell and an empty
 * file no longer produces a malformed "kill" call. Bare-word ROOT keys are
 * quoted (NOTE(review): assumes no constant named ROOT is defined).
 */
public function cron()
{
    foreach ($this->cities as $city => $city_id) {
        foreach ($this->sections as $deal_type => $params) {
            foreach ($params as $category => $v) {
                $logfile = "{$_ENV['ROOT']}/../logs/{$this->site->code}-{$city}-{$deal_type}-{$category}.log";
                $pidfile = "{$_ENV['ROOT']}/../pids/{$this->site->code}-{$city}-{$deal_type}-{$category}.pid";
                if (is_file($pidfile)) {
                    $pid = (int) trim(file_get_contents($pidfile));
                    if ($pid > 0) {
                        shell_exec("kill -9 {$pid} > /dev/null");
                    }
                }
                $cmd = "php {$_ENV['ROOT']}/grabber.php {$this->site->code} {$city} {$deal_type} {$category} > {$logfile} 2>&1 & echo $!";
                // NOTE(review): "M" prints a short month name (e.g. "Jan"); other timestamps in this file use "m" - confirm intent.
                echo "[", date("Y-M-d H:i:s"), "] ", $cmd, "\n";
                $pid = shell_exec($cmd);
                file_put_contents($pidfile, $pid);
                sleep(4 * 60);
            }
        }
    }
}

/**
 * Resolves the street of an item from its address, title or description
 * and stores the result in $item->street_id (best match) and
 * $item->streets (comma-separated candidate ids).
 *
 * Lookup order: conf::$stopwords, full-text search on the normalised
 * address, the same search on the description, then a boolean-mode prefix
 * search on the address words.
 *
 * @param object $item item with ->address, ->title, ->description, ->category
 * @return array references to the item's street_id and streets
 */
public function get_street(&$item)
{
    $item->street_id = 0;
    $item->streets = "";
    $res = ['street_id' => &$item->street_id, 'streets' => &$item->streets];
    if(!$item->category) return $res;

    // Strip the city name; fall back to the title when the address is too short.
    $re = "/\b" . preg_quote(conf::$city->name,"/") . "\b/ui";
    $address = trim(preg_replace($re,"",$item->address));
    if(mb_strlen($address,"utf-8") < 4) $address = preg_replace($re,"",$item->title);

    // Normalisation: Latin "c" -> Cyrillic "с", "ё" -> "е", common abbreviations,
    // removal of the district part, spaces around numbers.
    $address = preg_replace("/c/i","с",$address);
    $address = preg_replace("/ё/i","е",$address);
    $address = preg_replace("/\bлет.*\b/Uui","лет",$address);
    $address = preg_replace("/\bпереулок\b/Uui","пер",$address);
    $address = preg_replace("/\bпроспект\b/Uui","пр",$address);
    $address = preg_replace("/\bтракт\b/Uui","тр",$address);
    $address = preg_replace("/\b.*\bр(?:айо|\-о?)н\b/Uui","",$address);
    $address = preg_replace("/(\d+(?:\-я)?)/ui"," $1 ",$address);
    // NOTE(review): the address is escaped here and again before each query below (double escaping) - confirm intent.
    $address = mysql_real_escape_string($address);

    // Configured stop words map directly to a street id.
    $__address = $item->address . PHP_EOL . $item->title;
    foreach (conf::$stopwords as $a => $id) {
        if(preg_match("/\b{$a}/isu", $__address)) {
            $item->street_id = $id;
            $item->streets = $id;
            return $res;
        }
    }
    $item->street_id = 0;
    $item->streets = "";

    // Compound prefixes (ново-, старо-, ...): $r = 1 when one is present,
    // 2 when a non-letter separated it from the rest (the separator is removed).
    $re = "/\b(ново|старо|дальне|нижне|верхне|северо|юго|южно|западно|восточно|средне)(.)/ui";
    $r = 0;
    if(preg_match($re,$address,$tmp)){
        $r = 1;
        if(preg_match("/[^а-я]/ui",$tmp[2])){
            $r = 2;
            $address = preg_replace("/\b{$tmp[1]}[^а-я]+/ui",$tmp[1],$address);
        }
    }

    $sql = "SELECT id, MATCH(name, prefix) AGAINST ('%s') AS rank FROM oris_streets WHERE !deleted HAVING rank > 0 ORDER BY rank DESC LIMIT 8";
    $db = mysql_query(sprintf($sql,mysql_real_escape_string($address)));
    if(!mysql_num_rows($db) && $r == 2){
        // Retry with the hyphenated form of the compound name.
        $address = preg_replace($re,"$1-$2",$address);
        $db = mysql_query(sprintf($sql,mysql_real_escape_string($address)));
    }

    // Listing vocabulary that carries no street information.
    $re = "/\b(?:прода.*|аренд.*|сда|сниму|квартир.*|комнат.*|адрес.*|цена|окн.*|лоджи.*|балкон|совм.*|разд.*|ремонт|договор|торг)\b/Uui";
    if(!mysql_num_rows($db) && $item->description){
        $tmp = preg_replace($re,"",$item->description);
        $db = mysql_query(sprintf($sql,mysql_real_escape_string($tmp)));
    }
    if(!mysql_num_rows($db)){
        // Last resort: boolean-mode prefix search on every word of 4+ letters,
        // with the last 1-2 letters cut off to tolerate case endings.
        $address = preg_replace($re,"",$address);
        $tmp = preg_split("/[^а-я]+/ui",$address);
        $address = "";
        foreach($tmp as $_) {
            $n = mb_strlen($_,"utf-8");
            if($n >= 4) $address .= mb_substr($_,0,$n - ($n < 6 ? 1 : 2),"utf-8") . "* ";
        }
        $db = mysql_query("SELECT id, MATCH(name, prefix) AGAINST ('>" . mysql_real_escape_string($address) . "' IN BOOLEAN MODE) AS rank FROM oris_streets WHERE !deleted HAVING rank > 0 ORDER BY rank DESC LIMIT 8");
    }

    $r = 0;
    while($_ = mysql_fetch_object($db)){
        if(!$item->street_id){
            $item->street_id = $_->id;
            $r = $_->rank;
        }
        // if($r != $_->rank) break;
        $item->streets .= $_->id . ",";
    }
    $item->streets = preg_replace("/,$/","",$item->streets);
    return $res;
}

/**
 * Extracts the trailing house number from $item->address into
 * $item->house_number (SQL-escaped); empty string when none is found.
 * Also trims $item->address.
 *
 * @param object $item item with ->address
 */
public function get_house_number(&$item)
{
    $item->house_number = "";
    $item->address = trim($item->address);
    // The address must end with a number followed by at most six non-digits.
    if(preg_match("/\D([1-9]\d*\D{0,6})$/iu",$item->address)){
        preg_match_all("/\D(\d+)/iu",$item->address,$tmp,PREG_OFFSET_CAPTURE);
        $tmp = $tmp[1];
        // Walk backwards over digit groups lying within 6 bytes of each
        // other, so multi-part numbers are kept together.
        $e2 = array_pop($tmp);
        while($e1 = array_pop($tmp)){
            if($e2[1] - $e1[1] > 6) break;
            $e2 = $e1;
        }
        $item->house_number = substr($item->address, $e2[1]);
        // Backslashes become forward slashes.
        $item->house_number = preg_replace("/\\\/","/",$item->house_number);
    }
    $item->house_number = mysql_real_escape_string($item->house_number);
}

/**
 * Maps an area to its logarithmic bucket number, so that areas differing
 * by roughly the relative tolerance $this->dS fall into the same or
 * adjacent buckets.
 *
 * @param float|string $s area
 * @return int|float bucket number, 0 for an empty area
 */
public function get_link_s($s)
{
    return $s ?
ceil(log($s/(1 - $this->dS)) / log((1 + $this->dS)/(1 - $this->dS))) : 0;
}

# TODO: rename "link" to "group", which is more accurate
/**
 * Attaches an object to its link group (objects that share a phone,
 * category, deal type, street, level and room count), rebuilds the
 * aggregated `objects_links` row of that group and computes the "checked"
 * status by comparing the group with oris.oris_objects.
 *
 * Sets $item->link_id, $item->link_s and $item->checked.
 *
 * @param object      $item   stored object (needs ->object_id, ->s, ->streets, ...)
 * @param array|false $update false on first save; on update, the map of
 *                            changed fields built by updateItem()
 * @return array references to the item's link_id and link_s
 */
public function link_object(&$item, $update = false)
{
    $item->link_s = $this->get_link_s($item->s);
    $item->link_id = $item->object_id;
    $res = [link_id => &$item->link_id, link_s => &$item->link_s];
    // Only apartments and rooms are grouped.
    if(!$item->category || $item->category != "kvartiry" && $item->category != "komnaty") return $res;
    if($item->street_id){
        # TODO: review this
        # use link_s in the search, make link_id a single key without link_s
        // Look for an already linked, displayed object that shares a phone with this one and matches it on category, deal type, street, level and rooms.
        $tmp = mysql_fetch_assoc(mysql_query(" SELECT o.link_id FROM objects o INNER JOIN objects_phones p ON o.object_id = p.object_id INNER JOIN objects_phones t ON p.phone = t.phone AND p.object_id != t.object_id WHERE o.display AND o.link_id AND t.object_id = '{$item->object_id}' AND o.category = '{$item->category}' AND o.deal_type = '{$item->deal_type}' AND o.street_id = '{$item->street_id}' AND o.level = '{$item->level}' AND o.rooms = '{$item->rooms}' LIMIT 1; "));
        if($tmp["link_id"]) $item->link_id = $tmp["link_id"];
    }
    // $link accumulates the aggregated row written to `objects_links`.
    $link = new stdClass();
    foreach(["date","date_add","link_id","link_s","street_id","level","rooms","deal_type","is_agency"] as $f) $link->$f = $item->$f;
    $link->url = "";
    $link->checked = 0;
    $link->number = "0";
    $link->phone = [];
    $link->price = [];
    $link->images = [];
    $link->sites = [];
    $link->is_delayed = false;
    // site_id of the object that supplied the images / the url.
    $tmp = [ images => 0, url => 0, ];
    $code = conf::$city->code;
    $re = "/^" . $code . "/";
    // Candidate street ids as a set (id => id).
    $streets = explode(",", $item->streets);
    $streets = array_combine($streets, $streets);
    # TODO: review the algorithm
    # select $link from `objects_links` first, then compare with $item (see previous todo)
    # so the next loop can be removed
    # (???) races with other processes -> rebuild `object_links` in a separate dedicated process/cron, NOT HERE!! -> so grabbers could work faster!!
    # store combined streets of linked objects in `objects_links`
    // NOTE(review): AND binds tighter than OR, so this reads "object_id = X OR (link_id = Y AND link_s = Z)".
    $dbc = mysql_query("SELECT * FROM objects WHERE object_id = '{$item->object_id}' OR link_id = '{$item->link_id}' AND link_s = '{$item->link_s}' ORDER BY date DESC, date_add DESC");
    $link->count = mysql_num_rows($dbc);
    $checked = 0;
    while ($_ = mysql_fetch_object($dbc)) {
        // Keep the longest value of each text field.
        foreach (["title","raion","address","distance","params","seller","is_agency"] as $f) if (mb_strlen($link->$f,"utf-8") < mb_strlen($_->$f,"utf-8")) $link->$f = $_->$f;
        if($_->date > $link->date) $link->date = $_->date;
        $link->sites[$_->site_id] = $this->sites[$_->site_id]->name;
        $_->phone = explode("\n",preg_replace("/^7/sm", "8", trim($_->phone)));
        // phone => [escaped phone, escaped phone without the city code (when it starts with it)]
        foreach($_->phone as $f) {
            $f = trim($f);
            if($f > "" && !isset($link->phone[$f])){
                $link->phone[$f] = [mysql_real_escape_string($f)];
                if(preg_match($re, $f)) $link->phone[$f][] = mysql_real_escape_string(preg_replace($re, "", $f));
            }
        }
        // Images, url, description and price come from the first row that has them.
        if(!count($link->images) && trim($_->images) > ""){
            $_->images = preg_split("/(?:\r?\n){1,}/", trim($_->images));
            $tmp["images"] = $_->site_id;
            $link->images = $_->images;
        }
        if(!$link->url && trim($_->url) > ""){
            $tmp["url"] = $_->site_id;
            $link->url = trim($_->url);
        }
        if(!$link->description && trim($_->description) > "") {
            $link->description = trim($_->description);
        }
        if(!count($link->price)) $link->price = [$_->s => [$_->price]];
        if($_->checked && !$checked) $checked = $_->checked;
    }
    # images
    // Make relative image paths absolute using the host of the source site ("https" when the host ends with ":443").
    $host = $this->sites[$tmp["images"]]->host;
    $host = "http" . (preg_match("/[:]443$/", $host) ? "s" : "") . "://" . $host;
    foreach($link->images as &$_){
        if(!preg_match("/^https?\:\/\//is", $_)) $_ = $host . $_;
    }
    unset($_);
    $link->images = implode("\n", $link->images);
    # url
    if(!preg_match("/^https?\:\/\//is",$link->url)){
        $host = $this->sites[$tmp["url"]]->host;
        $host = "http" . (preg_match("/[:]443$/",$host) ? "s" : "") . "://" . $host;
        $link->url = $host . $link->url;
    }
    $link->sites = implode(", ",$link->sites);
    # prices
    /*foreach($link->price as &$_) { $_ = array_values($_); sort($_, SORT_STRING); } unset($_); ksort($link->price, SORT_NUMERIC);*/
    # phones
    // Build the quoted IN (...) list of all phone variants and reset every phone's state to "" (= not seen in oris yet).
    $tmp = "";
    foreach($link->phone as &$_){
        $tmp .= "'" . implode("','",$_) . "', ";
        $_ = "";
    }
    unset($_);
    $tmp = preg_replace("/,\s+$/","",$tmp);
    // NOTE(review): when the group has no phones, $tmp is empty and the query contains "IN ()" - confirm this cannot happen.
    $dbc = mysql_query(" SELECT t1.date_check, t1.object_id, t1.square, t1.rooms_count, t1.level, t1.price, t1.phone_home, t1.phone_work, t1.phone_cell, t1.deleted, t1.street_id, t1.number, IF(t2.object_id IS NULL,IF(t3.object_id IS NULL, NULL, 'sdam'), 'prodam') AS deal_type FROM oris.oris_objects t1 LEFT JOIN oris.oris_flat_sale t2 ON t1.object_id = t2.object_id LEFT JOIN oris.oris_flat_rent t3 ON t1.object_id = t3.object_id WHERE t1.phone_home IN ({$tmp}) OR t1.phone_work IN ({$tmp}) OR t1.phone_cell IN ({$tmp}) ORDER BY t1.date_check DESC ");
    $tmp = ["phone_home", "phone_work", "phone_cell"];
    $__cache = [];      // phones already matched by a "full duplicate" row
    $__deal_type = [];  // deal types under which the phones occur in oris
    unset($link->same, $link->archive, $__price);
    while($_ = mysql_fetch_object($dbc)){
        $__deal_type[$_->deal_type] = 1;
        if (isset($streets[$_->street_id]) && $link->deal_type == $_->deal_type) {
            $_->link_s = $this->get_link_s($_->square);
            // same: 1 once a non-deleted row on the same street is seen, 255 while only deleted ones were seen.
            if (!isset($link->same) || $link->same == 255) {
                $link->same = $_->deleted ? 255 : 1;
            }
            // "Full duplicate": same level and rooms, and the same area bucket or an area within 1 unit.
            if ($link->level == $_->level && $link->rooms == $_->rooms_count && ($link->link_s == $_->link_s || abs($item->s - $_->square) <= 1) ) {
                $link->street_id = $_->street_id;
                // archive stays truthy only while every duplicate seen so far is deleted.
                $link->archive = isset($link->archive) ? $link->archive && $_->deleted : $_->deleted;
                if ($link->number === "0") $link->number = $_->number;
                // Price of the first non-deleted duplicate.
                if (!$_->deleted && !isset($__price)) $__price = $_->price;
                foreach($tmp as $f) {
                    $f = $_->$f;
                    if (!$f) continue;
                    // Short numbers get the city code prepended.
                    if (mb_strlen($f, "utf-8") < 11) $f = $code . $f;
                    if (isset($link->phone[$f])) {
                        // Phone state: "" = unseen; otherwise truthy only while all rows seen for it are deleted.
                        $link->phone[$f] = $link->phone[$f] === "" ? $_->deleted : $link->phone[$f] && $_->deleted;
                        $__cache[$f] = 1;
                    }
                }
                continue;
            }
        }
        // Not a full duplicate: update the state of phones no duplicate has claimed yet.
        foreach($tmp as $f) {
            $f = $_->$f;
            if(!$f) continue;
            if(mb_strlen($f, "utf-8") < 11) $f = $code . $f;
            if(!isset($__cache[$f]) && isset($link->phone[$f])) $link->phone[$f] = $link->phone[$f] === "" ? $_->deleted : $link->phone[$f] && $_->deleted;
        }
    }
    # check that the phones do not occur under this deal type, or that the archive flag is set
    $__f = !isset($__deal_type[$link->deal_type]) || $link->archive;
    # check for new (unseen) and archived phones
    if (!$__f) foreach ($link->phone as $_) {
        if ($_ === "" || $_ === "1" || $_ === true) {
            $__f = 1;
            break;
        }
    }
    # the listing is a non-archived "full duplicate" with no new phones
    if (!$__f && !$link->archive && $link->number !== "0" && isset($__price)) {
        # compare prices
        // The price counts as equal when it matches as-is or differs by a factor of 1000 either way.
        $price = __floatval(current(current($link->price)));
        $t = [];
        $t[$price] = 1;
        $t[$price / 1000] = 1;
        $t[$price * 1000] = 1;
        $link->checked = isset($t[$__price]) ? 1 : 2;
    }
    else if (!$__f) {
        # the listing was fetched from the site again
        if ($update && isset($this->olditems[$item->id])) {
            $olditem = $this->olditems[$item->id];
            $diff = ['sdam' => 30, 'prodam' => 90];
            // if ($checked && ($olditem->date_diff >= $diff[$item->deal_type] || $update['price'])) {
            if ($checked && $olditem->date_diff) {
                $checked = 2;
            }
            // Changes to date, price and levels do not reset the status; any other tracked change does.
            unset($update['date'], $update['price'], $update['levels']);
            foreach($update as $_) {
                if($_) {
                    $checked = 0;
                    break;
                }
            }
        }
        $link->checked = $checked;
    }
    if(!$update && $link->checked) {
        mysql_query("UPDATE objects SET checked = '{$link->checked}' WHERE object_id = '{$item->object_id}' OR (link_id, link_s) = ('{$item->link_id}', '{$item->link_s}')") or die(mysql_error());
    }
    $item->checked = $link->checked;
    $link->phone = serialize($link->phone);
    $link->price = serialize($link->price);
    $data = (array) $link;
    unset($data["city"]);
    unset($data["category"]);
    foreach ($data as &$_) $_ = mysql_real_escape_string($_);
    unset($_);
    // Every remaining property of $link becomes a column of `objects_links`.
    $sql = "REPLACE INTO objects_links (`" . implode("`, `", array_keys($data)) . "`) VALUES('" .
implode("', '", $data) . "')";
    mysql_query($sql) or $this->error(mysql_error());
    return $res;
}

/**
 * Records an item whose page could not be processed in `objects_errors`;
 * for an already recorded item the fails counter is incremented.
 *
 * Fixes: every value is escaped before being placed into the query
 * (previously only the url was), and the caller's $item->url is no longer
 * overwritten with its escaped form.
 *
 * @param object $item item with ->id and ->url
 */
public function saveUrl($item)
{
    $item->site_id = $this->site->id;
    // Fall back to the values of the current grab() run.
    if (!$item->city) $item->city = $this->city;
    if (!$item->deal_type) $item->deal_type = $this->deal_type;
    if (!$item->category) $item->category = $this->category;

    $site_id = mysql_real_escape_string($item->site_id);
    $id = mysql_real_escape_string($item->id);
    $url = mysql_real_escape_string($item->url);
    $city = mysql_real_escape_string($item->city);
    $deal_type = mysql_real_escape_string($item->deal_type);
    $category = mysql_real_escape_string($item->category);

    mysql_query("INSERT INTO objects_errors (site_id,id,url,date_add,date_check,fails,city,deal_type,category) VALUES('{$site_id}','{$id}','{$url}',NOW(),NOW(),0,'{$city}','{$deal_type}','{$category}') ON DUPLICATE KEY UPDATE fails = fails + 1, date_check = NOW()") or $this->__log("saveUrl(): " . mysql_error());
}

/**
 * Extracts a floor number from free text: the smallest number matched by
 * any of the floor patterns.
 *
 * @param string $s listing text
 * @return string|int matched number (numeric string), or 0 when nothing matched
 */
public static function parse_level($s)
{
    $mathes = [];
    $count = 0;
    $patterns = [
        # "N-storey" form, e.g. «2-х этажный»
        '/\b(\d+)\D{0,4}\bэт(?:\b|аж)/ismu',
        # "number of floors N", e.g. «кол-во этажей 5»
        '/\bэтаж(?:ей?|ность)?\b\D{0,4}(\d+)\b/ismu',
        # "floor A of B" / "floor A/B", e.g. «Этаж 4 из 16», «Этаж 4/17»
        '/\bэтаж\b\D{1,4}\b\d+\s*(?:\bиз\b|\/)\s*(\d+)\b/ismu'
    ];
    foreach ($patterns as $rx) {
        $r = preg_match_all($rx, $s, $tmp);
        if ($r) {
            $mathes = array_merge($mathes, $tmp[1]);
            $count += $r;
        }
    }
    # Fallback: a bare "A/B" fraction, e.g. «1-к квартиру, 6/18 30/12/7».
    # "___" is appended so a fraction at the very end still has trailing non-digits.
    if (!$count && $r = preg_match_all('/\b(\d+)[^\d\/]{0,3}\/[^\d\/]{0,3}(\d+)[^\d\/]{3,}/ismu', $s . "___", $tmp)) {
        for ($i = 0; $i < $r; $i++) {
            // Only fractions with A <= B are plausible "floor / floors" pairs; take A.
            if ($tmp[1][$i] <= $tmp[2][$i]) {
                $mathes[] = $tmp[1][$i];
                $count++;
            }
        }
    }
    return $count ? min($mathes) : 0;
}

/**
 * Extracts the total number of floors (or levels) from free text: the
 * largest number matched by any of the patterns.
 *
 * @param string $s listing text
 * @return string|int matched number (numeric string), or 0 when nothing matched
 */
public static function parse_levels($s)
{
    $mathes = [];
    $count = 0;
    $patterns = [
        # "N-storey / N-level" form, e.g. «2-х этажный/уровневый»
        '/\b(\d+)\D{0,4}\b(?:эт(?:\b|аж)|уров)/ismu',
        # "number of floors/levels N", optionally "floors in the building"
        '/ \b(?:этаж(?:ей?|ность)?|уровн?е(?:нь|й|вый)?)\b (?:\s+в\s+(?:доме|здании|помещении|коттедже))? \D{0,4} (\d+)\b /ismux',
        # "floor A of B" / "floor A/B", e.g. «Этаж 4 из 16», «Этаж 4/17»
        '/\bэтаж\b\D{1,4}\b\d+\s*(?:\bиз\b|\/)\s*(\d+)\b/ismu'
    ];
    foreach ($patterns as $rx) {
        $r = preg_match_all($rx, $s, $tmp);
        if ($r) {
            $mathes = array_merge($mathes, $tmp[1]);
            $count += $r;
        }
    }
    # Fallback: a bare "A/B" fraction, e.g. «1-к квартиру, 6/18 30/12/7».
    if (!$count && $r = preg_match_all('/\b(\d+)[^\d\/]{0,3}\/[^\d\/]{0,3}(\d+)[^\d\/]{3,}/ismu', $s . "___", $tmp)) {
        for ($i = 0; $i < $r; $i++) {
            // Only fractions with A <= B are plausible; take B (the total).
            if ($tmp[1][$i] <= $tmp[2][$i]) {
                $mathes[] = $tmp[2][$i];
                $count++;
            }
        }
    }
    return $count ? max($mathes) : 0;
}

/**
 * Extracts an area from free text: the largest value matched by any of the
 * patterns, converted with __floatval().
 *
 * @param string $s listing text
 * @return mixed largest matched area, or 0 when nothing matched
 */
public static function parse_square($s)
{
    $mathes = [];
    $count = 0;
    $patterns = [
        // "A / B / C" triple (B and C may be "-"); A is captured
        '/ \b(\d+(?:[,.]\d+)?) \s*\/\s* (?:\d+(?:[,.]\d+)?|-) \s*\/\s* (?:\d+(?:[,.]\d+)?|-)\b /ismx',
        // number followed by a square-metre unit
        '/ \b(\d+(?:[.,]\d+)?) \D{0,3} (?:кв[.\s]\s*м|м\s*[2²]\b|квадрат\w{0,2}) /ismux',
        // "площадь ... (unit) N"
        '/ \bпл(?:ощадь)?\b (?:\s+(?:общая|жилая|дома|квартиры|комнаты|коттеджа|офиса|помещения|здания)\b)? [\s.,:;\(\)]+ (?:в\s*)? (?:кв[.\s]\s*м|м\s*[2²]\b) [\s.,:;\(\)]+ (\d+(?:[,.]\d+)?) /ismux',
        // "room/apartment/house/... N m"
        '/ \b(?:комнат[ау]|квартир[ау]|дом|помещение|офис)\b [\s.,:;\(\)]+ (\d+(?:[,.]\d+)?) \D{0,3}м /ismux',
    ];
    foreach ($patterns as $rx) {
        $r = preg_match_all($rx, $s, $tmp);
        if ($r) {
            foreach ($tmp[1] as &$_) $_ = __floatval($_);
            unset($_);
            $mathes = array_merge($mathes, $tmp[1]);
            $count += $r;
        }
    }
    return $count ? max($mathes) : 0;
}

/**
 * Extracts a land area (the patterns look for «сот», «га», «гектар»)
 * from free text: the largest value matched by any of the patterns,
 * converted with __floatval().
 *
 * @param string $s listing text
 * @return mixed largest matched value, or 0 when nothing matched
 */
public static function parse_land($s)
{
    $mathes = [];
    $count = 0;
    $patterns = [
        // number followed by a land unit
        '/\b(\d+(?:[,.]\d+)?)\b\D{0,4}\b(?:сот|га|гектар)/ismu',
        // "участок/земля ... (unit) N"
        '/ \b(?:участ(?:ок|к[ае])|земл[иея]) [\s.,:;\(\)]+ (?:в\s*)? (?:сот(?:ок|ках)?|га|гектар(?:ах)?) [\s.,:;\(\)]+ (\d+(?:[,.]\d+)?) /ismux',
    ];
    foreach ($patterns as $rx) {
        $r = preg_match_all($rx, $s, $tmp);
        if ($r) {
            foreach ($tmp[1] as &$_) $_ = __floatval($_);
            unset($_);
            $mathes = array_merge($mathes, $tmp[1]);
            $count += $r;
        }
    }
    return $count ?
max($mathes) : 0;
}

/**
 * Extracts the number of rooms from free text.
 *
 * Tried in order: a Russian number word ("двухкомн...", "трёшк..."), a
 * single digit followed by a "к"/"ком"/"комн..." abbreviation, and
 * "комн... N".
 *
 * Fix: the word pattern is case-insensitive, but the follow-up lookup used
 * a case-sensitive strpos(), so capitalised words such as "Двухкомнатная"
 * matched the pattern and then failed the lookup. The match is now
 * lower-cased first.
 *
 * @param string $s listing text
 * @param string $c category; "kvartiry" and "komnaty" default to 1 room
 * @return int|string room count (numeric string when taken from digits)
 */
public static function parse_rooms($s, $c = "")
{
    // Word stem => room count.
    $arr = [
        "одн" => 1,
        "дву" => 2,
        "тр" => 3,
        "четыр" => 4,
        "пяти" => 5,
        "шести" => 6,
    ];
    if (preg_match('/ \b( (?:одн[уеё]|дву|тр[её]|четыр[её])шк | (?:одно?|двух?|тр[её]х?|четыр[её]х?|пяти?|шести?)\W{0,3}комн ) /ismxu', $s, $tmp)) {
        $word = mb_strtolower($tmp[1], "utf-8");
        foreach ($arr as $k => $v) {
            if (strpos($word, $k) !== false) return $v;
        }
    }
    if (preg_match('/\b(\d)\s*[\D\S]{0,3}\s*к(?:ом(?:н(?:ат\w{0,3})?)?)?\b/ismu', $s, $tmp)) return $tmp[1];
    if (preg_match('/\bкомн\w{0,2}\D{0,3}(\d+)\D/ismu', $s, $tmp)) return $tmp[1];
    return ($c == "kvartiry" || $c == "komnaty" ? 1 : 0);
}

/**
 * Pseudo-abstract hook: this base implementation only reports an error
 * through $this->error().
 */
public function isValidItemHtml($html, $item)
{
    return $this->error("isValidItemHtml: pseudo abstract method, must be overriden");
}

/**
 * Advances to the next mirror in $this->mirrors (wrapping around to the
 * first one) and makes it the current mirror.
 *
 * @param bool $deleteCurrent also remove the previously current mirror from the list
 * @return mixed the new current mirror
 */
protected function get_mirror($deleteCurrent = false)
{
    if (!count($this->mirrors)) return $this->error("get_mirror: mirror list is empty");

    // First call: take the current element; later calls: move to the next one.
    $mirror = $this->mirror ? next($this->mirrors) : current($this->mirrors);
    if (!$mirror) {
        // Ran past the end of the list: wrap around.
        reset($this->mirrors);
        $mirror = current($this->mirrors);
    }
    if ($deleteCurrent) {
        $this->__log("deleting mirror: {$this->mirror}");
        reset($this->mirrors);
        foreach ($this->mirrors as $k => $v) {
            if ($v == $this->mirror) {
                unset($this->mirrors[$k]);
                break;
            }
        }
    }
    $this->mirror = $mirror;
    return $this->mirror;
}

/**
 * Performs a GET or POST with file_get_contents() and returns the raw
 * body; the request is echoed to stdout and followed by a 3 second pause.
 * Any other method terminates the script.
 *
 * @param string $url    target URL
 * @param array  $params query (GET) or form (POST) parameters
 * @param string $method "GET" or "POST"
 * @return string|false
 */
protected static function url_get_contents($url, $params = [], $method = "GET")
{
    if ($method == "GET") {
        if (is_array($params) && count($params)) $url = $url . "?" . http_build_query($params);
        echo "[" . date("Y-m-d H:i:s") . "] url_get_contents: {$url}\n";
        $context = stream_context_create([
            'http' => [ // was a bare word; quoted to avoid the undefined-constant fallback
                "protocol_version" => 1.1,
                "method" => "GET",
                "timeout" => 60,
                "header" => [
                    "Connection: close"
                ],
            ]]);
    } elseif ($method == "POST") {
        echo "[" . date("Y-m-d H:i:s") . "] url_get_contents: {$url}\n" .
print_r($params, 1);
        $content = http_build_query($params);
        $context = stream_context_create([
            'http' => [ // was a bare word; quoted to avoid the undefined-constant fallback
                "protocol_version" => 1.1,
                "method" => "POST",
                "timeout" => 60,
                "header" => [
                    "Content-Type: application/x-www-form-urlencoded",
                    "Content-Length: " . strlen($content),
                    "Connection: close",
                ],
                "content" => $content,
            ]]);
    } else {
        die ("[" . date("Y-m-d H:i:s") . "] Unknown request method: \"{$method}\"\n");
    }
    $data = file_get_contents($url, false, $context);
    sleep(3);
    return $data;
}

/**
 * Requests data from the current mirror and gunzips the response. When
 * the result is empty, the mirror is dropped from the list and the request
 * is retried on the next one; get_mirror() reports an error once the list
 * is empty.
 *
 * @param array|string $params parameter array, a numeric item id (sent as
 *                             "item_id") or an http(s) URL (sent as "url")
 * @param string       $method "GET" or "POST"
 * @return string decoded response body
 */
protected function mirror_get_contents($params, $method = "GET")
{
    $this->get_mirror();
    if (!is_array($params)) {
        if (preg_match("@^\d+$@", $params)) {
            $params = ['item_id' => $params];
        } else if (preg_match("@^https?://@is", $params)) {
            $params = ['url' => $params];
        } else {
            $params = [];
        }
    }
    $data = @gzdecode(self::url_get_contents($this->mirror, $params, $method));
    if (!$data) {
        $this->__log($this->mirror . " failed, trying next...");
        $this->get_mirror(true);
        return $this->mirror_get_contents($params, $method);
    }
    return $data;
}

/**
 * Sends a body-less request (CURLOPT_NOBODY) and returns the cURL
 * transfer info.
 *
 * Fix: the proxy option was set on an undefined variable ($ch) instead of
 * the $curl handle, so the proxy was never applied to the request.
 *
 * @param string       $url   target URL
 * @param bool|string  $proxy false = direct request; true = proxy chosen by
 *                            get_proxy(); string = passed to get_proxy()
 * @param array|string $opts  extra request header line(s)
 * @return array result of curl_getinfo()
 */
protected function get_url_head($url, $proxy = false, $opts = [])
{
    $headers = is_array($opts) ? $opts : [$opts];
    $headers[] = "Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3";
    $headers[] = "Cache-Control: max-age=0";
    $headers[] = "Connection: keep-alive";
    $headers[] = "User-Agent: Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0";

    $curl = curl_init();
    if ($proxy) {
        $this->get_proxy($proxy === true ? "" : $proxy);
        curl_setopt($curl, CURLOPT_PROXY, $this->proxy);
        $headers[] = "X-Forwarded-For: {$this->__proxy->ip}";
    } else {
        // Direct request: clear the proxy state and throttle.
        $this->__proxy = null;
        $this->proxy = null;
        sleep(3);
    }
    curl_setopt($curl, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($curl, CURLOPT_TIMEOUT, $this->timeout);
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_NOBODY, true);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_exec($curl);
    $info = curl_getinfo($curl);
    curl_close($curl);
    return $info;
}

/**
 * Stores an alias for the site code on the current site object.
 *
 * @param string $code alias value
 */
public function setSiteCodeAlias($code)
{
    $this->site->code_alias = $code;
}
}