get_lianjia_2sf.py

# -*- coding: utf-8 -*-

import datetime
import time
import pymongo
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from urllib3.exceptions import ProtocolError
from lxml import etree

client = pymongo.MongoClient("192.168.3.88", 27017)
db = client.fang.get_collection('lianjia_2sf_info_new')
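# Optional: index the id_ field so the per-listing duplicate checks in the
# main loop stay fast (create_index is a no-op if the index already exists).
db.create_index('id_')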
now_month = datetime.datetime.now().strftime("%Y-%m")
now = datetime.datetime.now().strftime("%Y-%m-%d")
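
# Build a headless Chrome driver with the options shared by all fetchers below.
def make_driver():
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36')
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument(f'user-agent={user_agent}')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=chrome_options)
    driver.set_page_load_timeout(60)
    driver.set_script_timeout(60)
    return driver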



def get_page_number(inurl_page):
    # How many list pages to walk for a district: Lianjia shows 30 items per
    # page, and +2 covers the partial last page plus range()'s exclusive end.
    # Note: Lianjia list views serve at most 100 pages.
    driver = make_driver()
    driver.get(inurl_page)
    html = driver.page_source
    driver.quit()
    html_page = etree.HTML(html)
    all_house_number = html_page.xpath('/html/body/div[4]/div[1]/div[2]/h2/span/text()')
    all_house_number = int(''.join(all_house_number))
    all_page_number = all_house_number // 30 + 2
    return all_page_number


def get_xq_url(inurl_list):
    # Collect the community (xiaoqu) detail URLs from one list page.
    driver = make_driver()
    driver.get(inurl_list)
    html = driver.page_source
    driver.quit()
    html_list = etree.HTML(html)
    all_url = html_list.xpath('/html/body/div[4]/div[1]/ul/li/div/div[1]/a//@href')
    return all_url

def get_esf_url_nb(inurl_list):
    # Despite the name, this returns the number of ershoufang (second-hand
    # listing) pages for one community, computed as in get_page_number().
    driver = make_driver()
    driver.get(inurl_list)
    html = driver.page_source
    driver.quit()
    html_page = etree.HTML(html)
    all_xq_number = html_page.xpath('/html/body/div[4]/div[1]/div[2]/h2/span/text()')
    all_xq_number = int(''.join(all_xq_number))
    all_xq_page_number = all_xq_number // 30 + 2
    return all_xq_page_number

def get_esf_url(inurl):
    # Collect the individual listing URLs from one ershoufang list page.
    driver = make_driver()
    driver.get(inurl)
    html = driver.page_source
    driver.quit()
    html_list = etree.HTML(html)
    all_url = html_list.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[1]/a//@href')
    return all_url
    
def gethtml(inurl, url_id):
    # Fetch one listing page, extract its fields, and insert a document into MongoDB.
    driver = make_driver()
    try:
        driver.get(inurl)
    except (TimeoutException, WebDriverException, ConnectionResetError, ProtocolError):
        print("Request for this URL failed or timed out~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        with open('%s_error_urls.txt' % now_month, 'a') as f:
            f.write(inurl + '\n')
        html = None
    else:
        html = driver.page_source
    driver.quit()

    if not html:
        return

    html = etree.HTML(html)
    q_title = ''.join(html.xpath('/html/body/div[3]/div/div/div[1]/h1/text()'))                 # listing title
    q_sale_all = ''.join(html.xpath('/html/body/div[5]/div[2]/div[3]/*//text()')[0:2])          # total price
    q_sale_m = ''.join(html.xpath('/html/body/div[5]/div[2]/div[3]/div[1]/div[1]/*//text()'))   # price per square meter
    q_name = ''.join(html.xpath('/html/body/div[5]/div[2]/div[5]/div[1]/a[1]/text()'))          # community name
    q_area = html.xpath('/html/body/div[5]/div[2]/div[5]/div[2]/span[2]/*//text()')[0]          # district
    q_area_2 = html.xpath('/html/body/div[5]/div[2]/div[5]/div[2]/span[2]/*//text()')[1]        # sub-district / street

    # Basic-attributes block: layout and building details
    q_hx = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[1]/text()'))     # floor plan (房屋户型)
    q_jzmj = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[3]/text()'))   # built area (建筑面积)
    q_tnmj = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[5]/text()'))   # inner area (套内面积)
    q_cx = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[7]/text()'))     # orientation (房屋朝向)
    q_zx = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[9]/text()'))     # decoration (装修情况)
    q_dt = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[11]/text()'))    # elevator (电梯)
    q_lc = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[2]/text()'))     # floor (所在楼层)
    q_hxjg = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[4]/text()'))   # layout structure (户型结构)
    q_jzlx = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[6]/text()'))   # building type (建筑类型)
    q_jzjg = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[8]/text()'))   # building structure (建筑结构)
    q_htbl = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[10]/text()'))  # elevator-to-unit ratio (梯户比例)

    # Transaction-attributes block
    q_gpsj = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[1]/span[2]/text()'))   # listing date (挂牌时间)
    q_scjy = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[3]/span[2]/text()'))   # last transaction (上次交易)
    q_fwlx = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[5]/span[2]/text()'))   # years of ownership (房屋年限)
    q_dyxx = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[7]/span[2]/text()'))   # mortgage info (抵押信息)
    q_fghy = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[9]/span[2]/text()'))   # housing-authority verification code (房管局核验码)
    q_jyqs = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[2]/span[2]/text()'))   # transaction ownership (交易权属)
    q_fwyt = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[4]/span[2]/text()'))   # property usage (房屋用途)
    q_cqss = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[6]/span[2]/text()'))   # ownership (产权所属)
    q_fbbj = ''.join(html.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[8]/span[2]/text()'))   # deed records (房本备件)

    # Free-text description blocks
    q_hxmd = ''.join(html.xpath('/html/body/div[7]/div[1]/div[2]/div/div[2]/div[2]/text()'))    # key selling points (核心卖点)
    q_xqjs = ''.join(html.xpath('/html/body/div[7]/div[1]/div[2]/div/div[3]/div[2]/text()'))    # community description (小区介绍)
    q_hxjs = ''.join(html.xpath('/html/body/div[7]/div[1]/div[2]/div/div[4]/div[2]/text()'))    # layout description (户型介绍)
    q_syrq = ''.join(html.xpath('/html/body/div[7]/div[1]/div[2]/div/div[5]/div[2]/text()'))    # target residents (适宜人群)

    db.insert_one({
        "id_": url_id, "标题": q_title, "小区名": q_name, "总售价": q_sale_all,
        "每平单价": q_sale_m, "区域": q_area, "街道": q_area_2, "房屋户型": q_hx,
        "建筑面积": q_jzmj, "套内面积": q_tnmj, "房屋朝向": q_cx, "装修情况": q_zx,
        "电梯": q_dt, "所在楼层": q_lc, "户型结构": q_hxjg, "建筑类型": q_jzlx,
        "建筑结构": q_jzjg, "梯户比例": q_htbl, "挂牌时间": q_gpsj, "上次交易": q_scjy,
        "房屋年限": q_fwlx, "抵押信息": q_dyxx.strip(), "房管局核验码": q_fghy,
        "交易权属": q_jyqs, "房屋用途": q_fwyt, "产权所属": q_cqss, "房本备件": q_fbbj,
        "核心卖点": q_hxmd.strip(), "小区介绍": q_xqjs.strip(), "户型介绍": q_hxjs.strip(),
        "适宜人群": q_syrq.strip(), "页面链接": inurl, "采集日期": now,
        "状态": "在售", "更新时间": now,
    })


# Full list of Wuhan district slugs used in Lianjia URLs:
# q_all_area = ['jiangan', 'jianghan', 'qiaokou', 'dongxihu', 'wuchang', 'qingshan', 'hongshan', 'hanyang', 'donghugaoxin', 'jiangxia', 'caidian', 'huangbei', 'xinzhou', 'zhuangkoukaifaqu', 'hannan']
q_all_area = ['donghugaoxin', 'jiangxia', 'qingshan', 'hongshan', 'wuchang', 'dongxihu', 'hanyang']    # districts covered by this run

for qa in q_all_area:
    # Find out how many community list pages this district has
    print(qa)
    inurl_page = 'https://wh.lianjia.com/xiaoqu/%s/' % qa
    print(inurl_page)
    page_numbers = get_page_number(inurl_page)    # total pages under the current filter
    print(page_numbers)

    # Walk the pages and collect the individual community URLs
    for pg in range(1, page_numbers):
        inurl_list = 'https://wh.lianjia.com/xiaoqu/%s/pg%s/' % (qa, pg)
        print(inurl_list)
        xq_urls = get_xq_url(inurl_list)    # community URLs on the current page

        for inurl in xq_urls:
            xq_id = inurl.split('/')[-2]
            inurl_list = 'https://wh.lianjia.com/ershoufang/c%s/' % xq_id
            print(inurl_list)
         
            esf_nb = get_esf_url_nb(inurl_list)    # number of listing pages for this community
            
            # list_url renamed from the reused name inurl to avoid shadowing the loop variable above
            for nb in range(1, esf_nb):
                list_url = 'https://wh.lianjia.com/ershoufang/pg%sc%s/' % (nb, xq_id)
                all_esf_urls = get_esf_url(list_url)
                print(list_url)
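                # Brief pause between list pages to go easy on the site (the 1s value is an assumption).
                time.sleep(1)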
                
                for esf_url in all_esf_urls:
                    url_id = esf_url.split('/')[-1].split('.')[0]

                    # Skip listings already stored in the collection; otherwise scrape the page
                    if db.count_documents({'id_': url_id}) > 0:
                        print(url_id, ' exists~')
                    else:
                        print(esf_url)
                        gethtml(esf_url, url_id)
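
client.close()    # release the MongoDB connection once the crawl is done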
