[自言自语] 给自己的 Kindle 写了个纽约时报(我也不知道叫什么，爬虫？工具？下载器？)

V2EX = way to explore

V2EX 是一个关于分享和探索的地方

已注册用户请登录

这是一个创建于 2466 天前的主题，其中的信息可能已经有所发展或是发生改变。

代码仅作纪念，所有权利保留喔

实际使用时还需要搭配 calibre 转换格式，不能使用 Email 直接发送 html 文件

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Save New York Times (Chinese edition) articles as standalone HTML files.

For every URL returned by ``URLSet.getalot()`` the page is downloaded, the
title and the ``article-paragraph`` blocks are extracted with BeautifulSoup,
and a minimal HTML document is written to the current working directory,
one file per article, named after the article title.
"""
import html
import logging
import os
import sys
import tempfile
import time

import requests
from bs4 import BeautifulSoup

DEBUG = False

# Browser-like User-Agent sent with every request.
_USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/71.0.3578.98 Safari/537.36'
)

# Article URLs processed by the script entry point.
# NOTE(review): the last entry has no trailing "dual/" segment, unlike the
# others -- confirm whether that is intentional.
_ARTICLE_URLS = (
    'https://cn.nytimes.com/china/20190118/china-population-crisis/dual/',
    'https://cn.nytimes.com/asia-pacific/20190118/north-korea-missile-kim-jong-un/dual/',
    'https://cn.nytimes.com/technology/20190118/huawei-investigation-trade-secrets/dual/',
    'https://cn.nytimes.com/asia-pacific/20190117/china-canada-schellenberg-death/dual/',
    'https://cn.nytimes.com/china/20190117/asian-cup-china-tattoos/dual/',
    'https://cn.nytimes.com/usa/20190116/trump-inauguration-spending/dual/',
    'https://cn.nytimes.com/opinion/20190118/germanys-china-problem/dual/',
    'https://cn.nytimes.com/travel/20190118/what-to-do-in-rome-36-hours/dual/',
    'https://cn.nytimes.com/opinion/20190116/will-chinas-economy-hit-a-great-wall/dual/',
    'https://cn.nytimes.com/style/20190115/modern-love-end-of-marriage-google-maps/dual/',
    'https://cn.nytimes.com/opinion/20190115/us-china-trade/dual/',
    'https://cn.nytimes.com/asia-pacific/20190118/philippines-subic-bay-shipyard/dual/',
    'https://cn.nytimes.com/education/20190117/the-gender-achievement-gap-starts-later-for-asian-american-students/dual/',
    'https://cn.nytimes.com/style/20190117/slice-joint-pizza-new-york-city/dual/',
    'https://cn.nytimes.com/culture/20190115/wod-furlough/',
)

# Placeholders, in order: <title> text, Chinese <h1>, English <h1>,
# pre-rendered <p> paragraphs.
_HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="zh-cmn-Hans-CN">
<head>
<meta charset="UTF-8">
<meta http-equiv="Content-Language" content="zh-cmn-Hans-CN"/>
<meta name="author" content="Nytimes"/>
<title>{}</title>
</head>
<body>
<h1>{}</h1>
<h1 class="en-title">{}</h1>
{}
</body>
</html>"""


def _meta_content(tag):
    """Return the ``content`` attribute of *tag*, or None if *tag* is None."""
    if tag is None:
        return None
    return tag.get('content')


def _safe_filename(name):
    """Return *name* with path separators replaced by underscores.

    A title containing a path separator would otherwise be interpreted as a
    directory path when the output file is opened.
    """
    for separator in (os.sep, os.altsep):
        if separator:
            name = name.replace(separator, '_')
    return name


class URLSet:
    """Provider of the article URLs used by the script."""

    def getAnyOnePost(self):
        """Return the URL of one fixed sample article."""
        return 'https://cn.nytimes.com/china/20190118/china-population-crisis/'

    def getDualPostURL(self, url):
        """Return *url* with a trailing ``dual/`` path segment.

        A URL that already ends in ``/dual/`` is returned unchanged (apart
        from a normalised trailing slash). A falsy *url* falls back to the
        sample article from ``getAnyOnePost``.
        """
        if not url:
            url = self.getAnyOnePost()
        base = url if url.endswith('/') else url + '/'
        if base.endswith('/dual/'):
            return base
        return base + 'dual/'

    @staticmethod
    def getTestURL():
        """Return one fixed ``dual/`` article URL."""
        return 'https://cn.nytimes.com/asia-pacific/20190117/china-canada-schellenberg-death/dual/'

    @staticmethod
    def getalot():
        """Return a new list with the hard-coded batch of article URLs."""
        return list(_ARTICLE_URLS)


class Spaker:
    """Writer that stores a rendered article on disk."""

    @staticmethod
    def saveAsHTML(post, filename):
        """Write the string *post* to ``<filename>.html``.

        Args:
            post: Complete HTML document as a string.
            filename: Target path without the ``.html`` suffix; when None a
                fixed fallback name is used.
        """
        if filename is None:
            filename = "temByNY2PDF.py"
        # The generated document declares UTF-8, so write it as UTF-8
        # regardless of the platform's default encoding.
        with open('{}.html'.format(filename), 'w', encoding='utf-8') as f:
            f.write(post)


class newsProcessor:
    """Download one article page and extract its title and paragraphs."""

    def __init__(self, url):
        """Store *url* and download its HTML via ``webCommissioner``."""
        self._url = url
        self._doc = webCommissioner.getHTMLDoc(self._url)

    def getContentStr(self):
        """Parse the downloaded page into a dict of article fields.

        Returns:
            A dict with the keys ``title`` (content of the ``og:title`` meta
            tag), ``enTitle`` (string of ``h1.en-title``, ``''`` when the
            heading is absent), ``date`` (content of the meta tag with
            ``id="date"``, None when absent), ``contents`` (list of paragraph
            texts) and ``contentStr`` (the paragraphs, each followed by a
            newline).

        Raises:
            ValueError: If the page has no ``og:title`` meta tag.
        """
        soup = BeautifulSoup(self._doc, "lxml")

        title = _meta_content(soup.find('meta', property='og:title'))
        if title is None:
            raise ValueError(
                'no og:title meta tag found in {}'.format(self._url))

        en_heading = soup.find('h1', {'class': 'en-title'})
        # ``.string`` is None when the tag has several children.
        en_title = (en_heading.string or '') if en_heading is not None else ''

        paragraphs = [
            block.get_text(strip=True)
            for block in soup.find_all('div', {'class': 'article-paragraph'})
        ]

        return {
            'title': title,
            'enTitle': en_title,
            'date': _meta_content(soup.find('meta', id='date')),
            'contents': paragraphs,
            'contentStr': ''.join(
                '{}\n'.format(text) for text in paragraphs),
        }


class webCommissioner:
    """Network helper that downloads HTML pages."""

    @staticmethod
    def getHTMLDoc(url, timeout=2):
        """Download *url* and return the response body as text.

        Args:
            url: Page to fetch.
            timeout: Seconds passed to ``requests.get`` as its timeout.

        Raises:
            requests.RequestException: On connection problems, timeouts or
                an HTTP error status.
        """
        response = requests.get(
            url, headers={'User-Agent': _USER_AGENT}, timeout=timeout)
        # Fail here rather than parsing an error page as if it were an article.
        response.raise_for_status()
        return response.text


class post:
    """An article rendered as a minimal standalone HTML document."""

    def __init__(self, postDict):
        """Build the HTML document from a ``getContentStr`` result dict.

        Args:
            postDict: Mapping with the keys ``title``, ``enTitle``, ``date``,
                ``contents`` and ``contentStr``.
        """
        self.title = postDict['title']
        self.enTitle = postDict['enTitle']
        self.date = postDict['date']
        self.contents = postDict['contents']
        self.contentsStr = postDict['contentStr']

        # The values are plain text, so escape them before embedding in HTML.
        paragraphs = ''.join(
            '<p>{}</p>\n'.format(html.escape(text, quote=False))
            for text in self.contents
        )
        safe_title = html.escape(self.title or '', quote=False)
        self.htmldoc = _HTML_TEMPLATE.format(
            safe_title,
            safe_title,
            html.escape(self.enTitle or '', quote=False),
            paragraphs,
        )


def main():
    """Download every article from ``URLSet.getalot`` and save it as HTML."""
    for url in URLSet.getalot():
        try:
            article = post(newsProcessor(url).getContentStr())
        except (requests.RequestException, ValueError) as err:
            # One unreachable or malformed page should not abort the batch.
            logging.error('Skipping %s: %s', url, err)
            continue
        Spaker.saveAsHTML(article.htmldoc, _safe_filename(article.title))


if __name__ == '__main__':
    main()

目前尚无回复

self result URL def