After a few days of fumbling around, I finished the crawler part by imitating examples.
A lot of the underlying principles still escape me, though; I mostly copied things over, so I need to keep studying.
Looking at the directory structure (sketched below), only the .py files matter; the .pyc files are generated at run time and can be ignored.
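Roughly, a Scrapy project of this generation is laid out like this (a generic sketch; only items.py, FilePipelines.py and settings.py are confirmed by the code below, the other names are standard placeholders):

zhaopin_page/
    scrapy.cfg              # project deploy/config file
    zhaopin_page/
        __init__.py
        items.py
        FilePipelines.py    # this project's pipeline module
        settings.py
        spiders/
            __init__.py
            page_spider.py  # placeholder name for the spider module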
items.py: defines the data you want to export
Pipelines.py: writes the data out
settings.py: project configuration; tells Scrapy which pipeline the data should flow through
__init__.py: haven't needed to touch it yet
(That's just my own tentative understanding for now; the sketch after this list shows how the pieces cooperate.)
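To make those roles concrete, here is a minimal sketch of how the three files cooperate at run time (JobItem, WritePipeline and myproject are made-up names for illustration, not the zhaopin code):

# items.py: declare the fields you want
from scrapy.item import Item, Field

class JobItem(Item):
    job_name = Field()

# pipelines.py: receives every item the spider yields
class WritePipeline(object):
    def process_item(self, item, spider):
        open('jobs.txt', 'ab').write(item['job_name'].encode('utf-8') + '\n')
        return item

# settings.py: wires the pipeline in (the number is its run order)
ITEM_PIPELINES = {'myproject.pipelines.WritePipeline': 5}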
What I ended up scraping is job postings from 智联招聘 (zhaopin.com); the important details are all in the code comments.
The key code for collecting job links is the link spider below; every other file in that project is the default that Scrapy generated.
#encoding: utf-8
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest, Request
from scrapy.selector import HtmlXPathSelector
import os
import sys
import datetime
import re

class ZhaoPinSpider(BaseSpider):
    name = "zhaopin"
    allowed_domains = ["zhaopin.com"]
    # pd=1 limits results to jobs added today; the city parameter is always 全国 (nationwide)
    zlzp_urlpatten = "http://sou.zhaopin.com/jobs/searchresult.ashx?pd=1&jl={CITY}&kw={KEYWORD}&p={CURR_PAGE}"

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive'
        }
        self.start_urls = self.set_url()  # set_url builds the list of start URLs dynamically

    def set_url(self):
        url_list = []
        # the keywords to search; one start URL is built per keyword
        keys = '大数据,hadoop,hive,hbase,spark,storm,sqoop,pig'
        for keyword in keys.split(','):
            url = self.zlzp_urlpatten
            url = url.format(CITY='全国', KEYWORD=keyword, CURR_PAGE=1)
            url_list.append(url)
        return url_list

    def start_requests(self):
        # must return an iterable containing the first Requests the spider will crawl
        for url in self.start_urls:
            yield FormRequest(url,
                              headers=self.headers,
                              callback=self.parse)  # parse() is the callback

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        keyword = hxs.select('//div[@class="search"]//input[@name="KeyWord"]/@value').extract()[0]
        keyword = keyword.encode('utf-8')
        url = self.zlzp_urlpatten
        # find the total number of result pages
        pageInfo = hxs.select('//div[@class="pagesDown"]//button/@onclick').extract()
        if pageInfo:
            # careful: pageInfo is absent when there is only one page
            pageInfo = pageInfo[0]
            pattern = re.compile('.*?value,(.*?),.*', re.S)
            findPageNum = re.search(pattern, pageInfo)
            pageNum = int(findPageNum.group(1))
        else:
            pageNum = 1
        for curPage in range(1, pageNum + 1):
            each_url = url.format(CITY='全国', KEYWORD=keyword, CURR_PAGE=curPage)
            yield Request(each_url, callback=self.get_joburls_bypage)

    def get_joburls_bypage(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//td[@class="zwmc"]//a/@href').extract()
        # everything found here was posted today, so write it straight out
        for link in links:
            if link != 'http://e.zhaopin.com/products/1/detail.do':  # this ad link sometimes shows up; skip it
                open('../output/link_output/link.txt', 'ab').write(link + '\n')
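One caveat: link.txt is opened with mode 'ab', so every run appends and repeated runs accumulate duplicate links. A minimal dedup pass (my own sketch, not part of the generated project) can be run before starting the page crawler:

# drop duplicate lines from link.txt while keeping their original order
lines = open('../output/link_output/link.txt', 'rb').readlines()
seen = set()
out = open('../output/link_output/link.txt', 'wb')
for line in lines:
    if line not in seen:
        seen.add(line)
        out.write(line)
out.close()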
Next, the code for the page project, which fetches the detailed information for each job.
settings
# Scrapy settings for zhaopin_page project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#

BOT_NAME = 'zhaopin_page'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['zhaopin_page.spiders']
NEWSPIDER_MODULE = 'zhaopin_page.spiders'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
ITEM_PIPELINES = {
    'zhaopin_page.FilePipelines.PagePipeline': 5
}
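The 5 after PagePipeline is the pipeline's order value: when several pipelines are enabled, Scrapy runs them on each item in ascending order of this number. For example (the second entry is hypothetical, shown only to illustrate the ordering):

ITEM_PIPELINES = {
    'zhaopin_page.FilePipelines.PagePipeline': 5,   # runs first (lower value)
    # 'zhaopin_page.DbPipelines.DbPipeline': 10,    # hypothetical second pipeline, would run after
}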
FilePipelines
# encoding: utf-8
import traceback
import datetime
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append("../../../")

class PagePipeline(object):
    # write the parsed item into one file per job posting
    def process_item(self, item, spider):
        fname = '../output/page_output/' + item['file_id'] + '.txt'
        try:
            sep = self.getJobFieldSpt()
            fields = [item['web_id'], item['job_url'], item['job_name'],
                      item['job_location'], item['job_desc'], item['edu'],
                      item['gender'], item['language'], item['major'],
                      item['work_years'], item['salary'], item['company_name'],
                      item['company_desc'], item['company_address'],
                      item['company_worktype'], item['company_scale'],
                      item['company_prop'], item['company_website'],
                      self.getCurrentTimestamp()]
            outfile = open(fname, 'wb')
            outfile.write(sep.join(fields))
            outfile.close()
        except Exception as e:
            print "ERROR GEN FILE!! >>> " + fname
            print traceback.format_exc()
        return item  # return the item so any later pipeline can still see it

    def getCurrentTimestamp(self):
        # current time as a formatted string
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def getJobFieldSpt(self):
        # separator between fields in the generated job files:
        # ASCII 1 (\x01), the same default field delimiter Hive uses
        return chr(1)
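Because the separator is chr(1), i.e. \x01, the same default field delimiter as Hive, the generated files can be loaded into a Hive table without specifying a custom delimiter. A quick sanity check in Python (the file id here is just an example):

# read one generated page file back and count its fields
content = open('../output/page_output/123456.txt', 'rb').read()  # example file id
fields = content.split(chr(1))
print len(fields)  # expect 19: the 18 written item fields plus the timestamp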
items
# encoding: utf-8
from scrapy.item import Item, Field

# item class that holds the content of one job posting
class PageItem(Item):
    web_id = Field()            # site identifier
    file_id = Field()           # name of the generated file
    job_url = Field()           # URL the posting came from
    job_name = Field()          # job title
    job_location = Field()      # work location
    job_desc = Field()          # job description
    edu = Field()               # education requirement
    gender = Field()            # gender requirement
    language = Field()          # language requirement
    major = Field()             # major requirement
    work_years = Field()        # years of experience
    salary = Field()            # salary range
    job_datetime = Field()      # posting date
    company_name = Field()      # company name
    company_desc = Field()      # company introduction
    company_address = Field()   # company address
    company_worktype = Field()  # industry
    company_scale = Field()     # company size
    company_prop = Field()      # ownership type
    company_website = Field()   # company website
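A PageItem behaves like a dict, which is why the spider below can simply write data['job_name'] = title; the catch is that only declared Fields are accepted. A tiny illustration:

from zhaopin_page.items import PageItem

item = PageItem()
item['job_name'] = u'数据工程师'  # dict-style assignment on a declared Field
print item['job_name']
# item['foo'] = 1  would raise KeyError because 'foo' is not a declared Field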
spider
# encoding: utf-8
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest, Request
from scrapy.selector import HtmlXPathSelector
from zhaopin_page import items
import traceback
import sys
import datetime
import re

# spider class that scrapes the detail pages
class ZhaoPinPageSpider(BaseSpider):
    name = "page"
    start_urls = []

    def __init__(self):
        self.start_urls = self.set_url()

    # read the list of links to crawl from link.txt into an array
    def set_url(self):
        url_list = []
        link_file = open('../output/link_output/link.txt', 'r')
        loops = 0
        for each_link in link_file:
            each_link = each_link.replace('\r', '')
            each_link = each_link.replace('\n', '')
            url_list.append(each_link)
            loops += 1
            if loops == 100:
                break
        link_file.close()
        return url_list

    def parse(self, response):
        try:
            # the digit string at the end of the URL
            file_id = response.url.split("/")[-1].split(".")[0]
            hxs = HtmlXPathSelector(response)
            # top banner: job title and company name
            title = ''        # job title
            companyName = ''  # company name
            basicInfo = hxs.select('//div[@class="fixed-inner-box"]').extract()[0]  # there are two; the second is a clone
            # NOTE: the HTML tag literals inside this pattern (and inside the
            # split() separators further down) were eaten when the post was
            # published, so they are left as they appear here
            pattern = re.compile('.*?(.*?).*?(.*?).*?', re.S)
            findBasicInfo = re.search(pattern, basicInfo)
            if findBasicInfo:
                title = findBasicInfo.group(1).strip()        # job title
                companyName = findBasicInfo.group(2).strip()  # company name
            # left-hand company box: a single regex won't work here, because some
            # postings omit fields, e.g. http://jobs.zhaopin.com/297851037250005.htm
            companySize = ''     # company size
            companyType = ''     # ownership type
            companyLine = ''     # industry
            companyHost = ''     # company homepage
            companyAddress = ''  # company address
            # even though there is only one match, extract() returns a list, so index into it
            companyInfo = hxs.select('//div[@class="company-box"]').extract()[0].encode('utf-8')
            if companyInfo.find('公司规模:') > -1:
                companySize = companyInfo.split('公司规模:')[1]
                companySize = companySize.split(' ')[1]
                companySize = companySize.split('')[0].strip()
            if companyInfo.find('公司性质:') > -1:
                companyType = companyInfo.split('公司性质:')[1]
                companyType = companyType.split(' ')[1]
                companyType = companyType.split('')[0].strip()
            if companyInfo.find('公司行业:') > -1:
                companyLine = companyInfo.split('公司行业:')[1]
                companyLine = companyLine.split(' ')[1]
                companyLine = companyLine.split('')[0]
                companyLine = companyLine.split('>')[1].strip()
            if companyInfo.find('公司主页:') > -1:
                companyHost = companyInfo.split('公司主页:')[1]
                companyHost = companyHost.split('')[1]
                companyHost = companyHost.split('')[0]
                companyHost = companyHost.split('>')[1].strip()
            if companyInfo.find('公司地址:') > -1:
                companyAddress = companyInfo.split('公司地址:')[1]
                companyAddress = companyAddress.split('')[1]
                companyAddress = companyAddress.split('')[0].strip()
            # middle block: job requirements; the variables must be initialised
            # first or the re.sub() cleanup below blows up
            salary = ''        # monthly salary
            address = ''       # work location
            jobDateTime = ''   # posting date
            jobCategory = ''   # job nature (full/part time)
            experience = ''    # work experience
            education = ''     # minimum education
            numberInNeed = ''  # number of openings
            jobType = ''       # job category
            jobRequirementInfo = hxs.select('/html/body/div[4]/div[1]/ul').extract()[0]
            # no leading spaces or tabs are allowed inside the pattern, otherwise it won't match
            pattern = re.compile('.*?(.*?)\.*?.*? (.*?)\.*? .*? (.*?)\.*? (.*?)\.*? (.*?)\.*? (.*?)\.*? (.*?)\.*? .*?target.*?>(.*?)', re.S)
            findJobRequirementInfo = re.search(pattern, jobRequirementInfo)
            if findJobRequirementInfo:
                salary = findJobRequirementInfo.group(1).strip()        # monthly salary
                address = findJobRequirementInfo.group(2).strip()       # work location
                jobDateTime = findJobRequirementInfo.group(3).strip()   # posting date
                jobCategory = findJobRequirementInfo.group(4).strip()   # job nature
                experience = findJobRequirementInfo.group(5).strip()    # work experience
                education = findJobRequirementInfo.group(6).strip()     # minimum education
                numberInNeed = findJobRequirementInfo.group(7).strip()  # number of openings
                jobType = findJobRequirementInfo.group(8).strip()       # job category
            # description blocks
            detailInfo = hxs.select('//div[@class="tab-inner-cont"]').extract()
            jobDescribe = detailInfo[0]
            companyDescribe = detailInfo[1]
            pattern = re.compile('<.*?>| ', re.S)  # strip the useless markup
            jobDescribe = re.sub(pattern, '', jobDescribe).strip()          # job description
            companyDescribe = re.sub(pattern, '', companyDescribe).strip()  # company introduction
            companySize = re.sub(pattern, '', companySize).strip()
            companyType = re.sub(pattern, '', companyType).strip()
            companyLine = re.sub(pattern, '', companyLine).strip()
            companyHost = re.sub(pattern, '', companyHost).strip()
            companyAddress = re.sub(pattern, '', companyAddress).strip()
            salary = re.sub(pattern, '', salary).strip()
            address = re.sub(pattern, '', address).strip()
            jobDateTime = re.sub(pattern, '', jobDateTime).strip()
            jobCategory = re.sub(pattern, '', jobCategory).strip()
            experience = re.sub(pattern, '', experience).strip()
            education = re.sub(pattern, '', education).strip()
            numberInNeed = re.sub(pattern, '', numberInNeed).strip()
            jobType = re.sub(pattern, '', jobType).strip()
            title = re.sub(pattern, '', title).strip()
            companyName = re.sub(pattern, '', companyName).strip()
            data = items.PageItem()
            data['web_id'] = "zhaopin"
            data['file_id'] = file_id
            data['job_url'] = response.url
            data['job_name'] = title
            data['job_desc'] = jobDescribe
            data['gender'] = ""
            data['major'] = ""
            data['company_name'] = companyName
            data['job_datetime'] = jobDateTime
            data['job_location'] = address
            data['work_years'] = experience
            data['edu'] = education
            data['salary'] = salary
            data['company_desc'] = companyDescribe
            data['company_address'] = companyAddress
            data['company_website'] = companyHost
            data['language'] = ""
            data['company_worktype'] = companyLine
            data['company_prop'] = companyType
            data['company_scale'] = companySize
            # update the crawl status in the task table (not used here)
            #self.jobsTool.updateCrulInfo(ConfigPropObj.liepin_webid, response.url, 1, "")
            return data
        except Exception as e:
            print "ERROR PARSE"
            print response.url
            print traceback.format_exc()
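To run the whole thing end to end: scrapy crawl zhaopin in the link project fills link.txt, then scrapy crawl page in the zhaopin_page project walks those links (the first 100 per run, per the loops counter). Each PageItem that parse() returns is handed to PagePipeline.process_item, which writes one \x01-delimited file per job into ../output/page_output/.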