update_status_lianjia.py

# -*- coding: utf-8 -*-
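# Revisit previously-scraped Lianjia second-hand-housing listing pages with
# headless Chrome and refresh each record's sale status, price and update
# date in MongoDB.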

import datetime
import os
import pymongo
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from urllib3.exceptions import ProtocolError
from lxml import etree

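# MongoDB collection holding the scraped Lianjia second-hand listing records.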
client = pymongo.MongoClient("192.168.3.88", 27017)
db = client.fang.get_collection('lianjia_2sf_info_new')
now_month = datetime.datetime.now().strftime("%Y-%m")
now = datetime.datetime.now().strftime("%Y-%m-%d")

   
def gethtml(inurl, url_id):
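    """Load a single listing page and refresh its MongoDB record.

    Pages that fail to load are appended to this month's error-URL file
    and the corresponding record is left untouched.
    """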
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument(f'user-agent={user_agent}')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=chrome_options)
    driver.set_page_load_timeout(60)
    driver.set_script_timeout(60)
    
    try:
        driver.get(inurl)
    except (TimeoutException, WebDriverException, ConnectionResetError, ProtocolError):
        # Append the failed URL to this month's error file so it can be retried later.
        os.system("echo %s >> %s_update_error_urls.txt" % (inurl, now_month))
        html = None
    else:
        html = driver.page_source
    finally:
        driver.quit()
    
    if html:
        html = etree.HTML(html)
        q_sale_status = html.xpath('/html/body/div[3]/div/div/div[1]/h1/span/text()')  # sale-status banner; empty means the listing is still for sale
        q_sale_status = ''.join(q_sale_status)
        q_sale_all = html.xpath('/html/body/div[5]/div[2]/div[3]/*//text()')[0:2]   # total asking price
        q_sale_all = ''.join(q_sale_all)
        q_sale_m = html.xpath('/html/body/div[5]/div[2]/div[3]/div[1]/div[1]/*//text()')   # price per square meter
        q_sale_m = ''.join(q_sale_m)
        if len(q_sale_status) == 0:
            db.update_one({"id_": url_id}, {"$set": {"状态": "在售", "总售价": q_sale_all, "每平单价": q_sale_m, "更新时间": now}})
        else:
            db.update_one({"id_": url_id}, {"$set": {"状态": q_sale_status.strip(), "更新时间": now}})




# esf_info = db.find({"$or": [{'更新时间': None}, {'状态': '在售'}, {'状态': '已下架'}]}).batch_size(10).skip(78087)
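# Re-check listings that have never been updated, or whose last known status is
# 在售 (for sale) or 已下架 (delisted).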
esf_info = db.find({"$or": [{'更新时间': None}, {'状态': '在售'}, {'状态': '已下架'}]}).batch_size(10)
count = 1

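# The record id is the last path segment of the stored page URL, with its
# ".html" suffix stripped.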
for info in esf_info:
    inurl = info['页面链接']
    print(count, inurl)
    count += 1
    url_id = inurl.split('/')[-1].split('.')[0]
    gethtml(inurl, url_id)

# Listings whose total price came back empty are marked as sold.
db.update_many({"总售价": ""}, {"$set": {"状态": "已出售"}})
