爬虫工具箱:InfoSpider

InfoSpider 是一个集众多数据源于一身的爬虫工具箱,提供数据分析功能,可基于用户数据生成图表文件,使用户能更直观、深入地了解自己的信息。目前支持的数据源有:GitHub、QQ 邮箱、网易邮箱、阿里邮箱、新浪邮箱、Hotmail 邮箱、Outlook 邮箱、京东、淘宝、支付宝、中国移动、中国联通、中国电信、知乎、哔哩哔哩、网易云音乐、QQ 好友、QQ 群、生成朋友圈相册、浏览器浏览历史、12306、博客园、CSDN 博客、开源中国博客、简书。GitHub 地址→https://github.com/kangvcar/InfoSpider

Python最简单HTTP服务器与MVC

socket模拟HTTP服务基于socket开发,监听127.0.0.1任一端口,如:8888,接收监听到的数据,并通过conn以HTTP响应的格式返回数据import socketsock = socket.socket()sock.bind(("localhost", 8888)) # 绑定监听的IP地址与端口8888sock.listen(5) # 设置最大监听数while True: conn, addr = sock.accept() data = conn.recv(1024) print(data) # 打印查看请求头与请求体 # 必须以HTTP响应头的格式返回数据,否则浏览器无法正常解析 # 同时注意send的数据不能是str字符串,必须是bytes,否则会报错。 conn.send(b'HTTP/1.1 200 OK\r\n\r\n <h1>Hello world!</h1>') conn.close()浏览器访问localhost:8888,即可看到网页结果,http response中响应体前必须有两空行’ \r\n\r\n’ 否则会被认为是响应头的内容https://blog.csdn.net/qq_29941979/article/details/107871763˂a name=利用Tornado库 href="#"˃利用Tornado库my.pyimport tornado.ioloopimport tornado.web#访问地址 http://127.0.0.1:9870/main?ywdm=06&num1=10&num2=200class TestClassA: def sub(self,a,b): return a-b def add(self,a,b): return a+b def chen(self,a,b): return a*bclass TestClassB: def sub(self,a,b): return a-b def add(self,a,b): return a+b def chen(self,a,b): return a*bsys_config={}sys_config["01"]=['mymvc','TestClassA','add']sys_config["02"]=['mymvc','TestClassA','sub']sys_config["03"]=['mymvc','TestClassA','chen']sys_config["04"]=['mymvc','TestClassB','add']sys_config["05"]=['mymvc','TestClassB','sub']sys_config["06"]=['mymvc','TestClassB','chen']class MainHandler(tornado.web.RequestHandler): def get(self): ywdm=self.get_argument('ywdm') num1=int(self.get_argument('num1').encode('utf-8')) num2=int(self.get_argument('num2').encode('utf-8')) message="hello !" 
print type(num2) if ywdm in sys_config: my_module_name=sys_config[ywdm][0] my_class_name=sys_config[ywdm][1] my_method_name=sys_config[ywdm][2] my_module = __import__(my_module_name) my_class = getattr(my_module,my_class_name) my_obj = my_class() my_method = getattr(my_obj,my_method_name) ret=my_method(num1,num2) print "ret:",ret #message = "ret data:::"+ret message="ywdm:"+ywdm+"-data:"+str(ret) items = ["Item 1", "Item 2", "Item 3"] self.render("test.html", title="My title", items=items,config_items=sys_config) #self.write(message) #self.finish()application = tornado.web.Application([(r"/main", MainHandler),])if __name__ == "__main__": application.listen(9870) tornado.ioloop.IOLoop.instance().start()test.html<html> <head> <title>{{ title }}</title> </head> <body> <ul> {% for item in items %} <li>{{ escape(item) }}</li> {% end %} </ul> <ul> {% for item in config_items %} <li>{{ escape(config_items[item][0]) }} -{{ escape(config_items[item][1]) }} -{{ escape(config_items[item][2]) }} </li> {% end %} </ul> </body> </html>https://blog.csdn.net/5iasp/article/details/23267609其中py中application中为数组形式,可以加多条映射,返回结果可以返回渲染后模板也可以返回字符串,例如:import tornado.ioloopimport tornado.webimport ctypesclass MainHandler(tornado.web.RequestHandler): def get(self): ip = self.get_argument('ip') print(ip) self.write("成功") self.finish()class IndexHandler(tornado.web.RequestHandler): def get(self): items = [] self.render("ip.html", title="My title", items=items)application = tornado.web.Application([(r"/index/get", MainHandler),(r"/index", IndexHandler) ])if __name__ == "__main__": # 最小化程序 ctypes.windll.user32.ShowWindow(ctypes.windll.kernel32.GetConsoleWindow(), 6) application.listen(9870) tornado.ioloop.IOLoop.instance().start()

pycharm安装依赖包过慢解决方法

很简单,在terminal中输入如下命令即可,例如安装requests:pip3 install requests -i http://pypi.douban.com/simple --trusted-host pypi.douban.com 。其他国内源参看:https://cway.top/post/831.html

Playwright实现多端发帖

Playwright实现多端发帖一直想做,但是抓包太麻烦了,刚好接触到这个工具,一想其实用途还挺多,可以自己录个多平台发文脚本。例如写篇文章,想同步在简书、zblog、hu60、v2ex等平台发布,这几个平台刚好都支持markdown。那么我先根据路径读取md文件,以文件名为标题,文本为内容再执行各个发布方法,但在此之前,我们先获取登陆后的浏览器信息并保存吧from playwright import sync_playwrightdef hu60(playwright, name, pwd): browser = playwright.chromium.launch(headless=False) context = browser.newContext() page = context.newPage() page.goto("https://hu60.cn/q.php/user.login.html?u=index.index.html") page.fill("input[name=\"name\"]", name) page.fill("input[name=\"pass\"]", pwd) page.click("input[name=\"go\"]") page.close() # 保存浏览器数据至hu60文件 方便发布脚本读取 context.storageState(path="hu60") context.close() browser.close()def zblog(playwright, name, pwd): browser = playwright.chromium.launch(headless=False) context = browser.newContext() page = context.newPage() page.goto("https://cway.top/zb_system/login.php") page.fill("input[name=\"edtUserName\"]", name) page.fill("input[name=\"edtPassWord\"]", pwd) page.close() context.storageState(path="cway") context.close() browser.close()with sync_playwright() as playwright: hu60(playwright, '帐号', '密码') zblog(playwright, '帐号', '密码')接着我们可以直接执行发布脚本了,键入文本路径即可from playwright import sync_playwrightdef zblog(playwright, title, content, id): import time browser = playwright.chromium.launch(headless=False) context = browser.newContext(storageState="cway") page = context.newPage() page.goto("https://cway.top/") page.click("text=\"新建文章\"") page.click("input[name=\"Title\"]") page.fill("input[name=\"Title\"]", title) page.fill("//div[normalize-space(.)='Enjoy Markdown! 
coding now...​x 1​']/div[1]/textarea", content) time.sleep(1) page.selectOption("select[id=\"cmbCateID\"]", id) page.click("input[type=\"submit\"]") page.goto("https://cway.top/") text = page.innerText("body") assert title in text page.close() context.close() browser.close()def hu60(playwright, title, content, id): browser = playwright.chromium.launch(headless=False) context = browser.newContext(storageState="hu60") page = context.newPage() page.goto("https://hu60.cn/q.php/index.index.html") page.click("text=\"发帖\"") page.click("text=/.*" + id + ".*/") page.click("input[name=\"title\"]") page.fill("input[name=\"title\"]", title) page.fill("textarea[name=\"content\"]", content) page.click("input[name=\"go\"]") context.close() browser.close()def jianshu(playwright, title, content, id): import time browser = playwright.chromium.launch(headless=False) context = browser.newContext(storageState="jianshu") page = context.newPage() page.goto("https://www.jianshu.com/writer#/") page.click("text=\"" + id + "\"") with page.expect_navigation(): page.click("text=\"" + id + "\"") page.click("//span[normalize-space(.)='新建文章']") page.click("//div[normalize-space(.)='发布文章']/input[normalize-space(@type)='text']") page.fill("//div[normalize-space(.)='发布文章']/input[normalize-space(@type)='text']", title) page.click("textarea[id=\"arthur-editor\"]") page.fill("textarea[id=\"arthur-editor\"]", content) time.sleep(2) page.click("//a[normalize-space(.)='发布文章']") page.close() context.close() browser.close()with sync_playwright() as playwright: path = input('请输入md或txt文件路径或在控制台拖入文件:') file = open(path, 'r') fn = file.name.split('/') # 获取标题 title = fn[len(fn) - 1].split('.')[0] # 获取内容 content = file.read() # 分类处理 由于每个平台文章分类不一样 酌情修改 # 指定分类 cata = 'py' zbcata, hu60cata, jscata = '7', 'Java', 'Java' if cata == 'java': zbcata, hu60cata, jscata = '7', 'Java', 'Java' if cata == 'py': zbcata, hu60cata, jscata = '18', 'Python', 'Python' zblog(playwright, title, content, zbcata) hu60(playwright, title, 
content, hu60cata) # jianshu(playwright, title, content, jscate)

Python解决pip下载慢 使用国内源教程

国内源:清华:https://pypi.tuna.tsinghua.edu.cn/simple ;阿里云:https://mirrors.aliyun.com/pypi/simple/ ;中国科学技术大学:https://pypi.mirrors.ustc.edu.cn/simple/ ;华中理工大学:http://pypi.hustunique.com/ ;山东理工大学:http://pypi.sdutlinux.org/ ;豆瓣:http://pypi.douban.com/simple/ 。临时使用:可以在使用pip的时候加参数 -i https://pypi.tuna.tsinghua.edu.cn/simple ,例如:pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pyspider,这样就会从清华这边的镜像去安装pyspider库。永久修改,一劳永逸:Linux下,修改 ~/.pip/pip.conf (没有就创建一个文件夹及文件。文件夹要加“.”,表示是隐藏文件夹),内容如下(每项各占一行):[global] index-url = https://pypi.tuna.tsinghua.edu.cn/simple [install] trusted-host = pypi.tuna.tsinghua.edu.cn 。Windows下,直接在user目录中创建一个pip目录,如:C:\Users\xx\pip,新建文件pip.ini。内容同上。

Python安装与配置环境变量(Windows)

官网:https://www.python.org/downloads/ 。下载后Windows用户直接下一步下一步安装即可。安装好后配置环境变量也简单,主要将安装目录与安装目录下的Scripts目录加入Path中即可。例如我的安装目录是D:\Programs\Python\Python39,那就将以下两个路径加入Path中即可:D:\Programs\Python\Python39 和 D:\Programs\Python\Python39\Scripts

Playwright(python)浏览器脚本录制工具使用

功能:录制浏览器操作并自动生成py或js代码。以下是python环境下的教程。环境要求需Python3.7+,安装:# 安装playwright库 pip install playwright # 安装浏览器驱动文件(文件较大有点慢) python -m playwright install 。录制:python -m playwright codegen 。其他选项:--target 生成语言,有python/javascript/python-async/csharp可选,缺省值为python;-o 保存路径,也可以写成--output;-b 指定浏览器,浏览器选项如下(缺省默认为chromium):cr 谷歌浏览器,或全称chromium;ff 火狐浏览器,或全称firefox;wk 全称webkit;-h 查看帮助,也可写成--help,例如:python -m playwright codegen -h 。例如指令:python -m playwright codegen --target python -o 'my.py' -b chromium https://cway.top ,脚本代码会直接在控制台输出供你复制,或者在执行命令目录下查看my.py文件。完整选项/命令:选项: -V, --version 输出版本号 -b, --browser <browserType> 浏览器类型 --color-scheme <scheme> 更改主题 取值 "light" 或 "dark" --device <deviceName> 模拟设备,例如 "iPhone 11" --geolocation <coordinates> 指定地理位置 例如 "37.819722,-122.478611" --lang <language> 指定语言区域 "en-GB" --save-storage <filename> 保存浏览器状态到指定文件 --load-storage <filename> 载入指定文件浏览器状态 --proxy-server <proxy> 指定代理服务器 例如 "http://myproxy:3128" 或 "socks5://myproxy:8080" --timezone <time zone> 时区设置 例如 "Europe/Rome" --timeout <timeout> 超时设置,单位毫秒 (default: "10000") --user-agent <ua string> 指定UA --viewport-size <size> 指定浏览器像素 "1280, 720" 命令: open [url] 打开URL或用-b, --browser指定浏览器 cr [url] 打开URL用Chromium ff [url] 打开URL用Firefox wk [url] 打开URL用WebKit codegen [options] [url] 打开页面生成代码 screenshot [options] <url> <filename> 页面截图 pdf [options] <url> <filename> 保存页面为pdf install 确保安装必要的浏览器驱动 help [command] 帮助。--save-storage与--load-storage是非常实用的选项,例如用下面命令访问网站并登陆,关闭浏览器时自动把cookie等浏览器信息存入cway文件中:python -m playwright cr https://cway.top --save-storage cway ;使用时用下述命令直接调用,打开页面即为登陆状态的cway.top:python -m playwright cr https://cway.top --load-storage cway 。假如我有多个网站帐号就可以存在多个不同文件,使用时输入命令即可,文件默认储存在当前执行命令的目录。在网站录制操作的过程中也可以用--save-storage,例如:python -m playwright codegen --target python -o 'login.py' https://cway.top --save-storage cway ,这样py代码中也生成了保存信息到本地的功能代码,适合于更新帐号信息,然后录制操作只用录制登陆后的页面即可,如下命令,直接读取已登陆的状态,然后就能在已登陆状态下录制:python -m playwright codegen --target python -o 'run.py' https://cway.top --load-storage cway
开源地址:https://github.com/microsoft/playwright 、https://github.com/microsoft/playwright-python

Python打包windows exe/linux可执行文件方法

方法一:pyinstaller。pyinstaller适合Windows与Linux。1、安装pyinstaller,执行命令:pip install pyinstaller ;2、打包:pyinstaller -F -c main.py ;生成带图标的使用-i:pyinstaller -i xxx.ico hello.py 。在当前目录下的 dist 文件夹内可以找到生成后的可执行文件。方法二:py2exe。只适合Windows平台。1、安装py2exe:pip install py2exe ;2、使用:build_exe main.py 。经测试python3.6版本以上会报错缺少模块,请降级或安装模块。因此个人建议还是方法一吧。问题解答:Windows系统若Python未加入环境变量如何使用?直接进python安装目录,进入Scripts文件夹,在资源管理器地址栏输入cmd,输入pip.exe install pyinstaller即可安装,并且安装文件也在Scripts文件夹里,输入pyinstaller.exe -F -c C:\Users\Administrator\Desktop\pythonProject\main.py即可

Python中Excel读取写入(xlrd与xlwt)的使用

xlrd读取Excel(仅支持xls格式)import xlrddata=xlrd.open_workbook("excel.xls")# # 判断0工作表是否被加载# print(data.sheet_loaded(0))# # 卸载工作表后再输出为false表示未加载# data.unload_sheet(0)# print(data.sheet_loaded(0))# 打印所有sheet 与 索引为1的sheetprint(data.sheets())print(data.sheets()[0])# 根据索引名字获取 Sheet 0:<Sheet1>print(data.sheet_by_index(0))print(data.sheet_by_name("Sheet1"))# 获取所有工作表名字 ['Sheet1', 'Sheet2', 'Sheet3']print(data.sheet_names())# 获取sheets总数 3print(data.nsheets)# 操作行列sheet=data.sheet_by_index(0)# 获取总行数print(sheet.nrows)# 获取索引行内容print(sheet.row(0))# 获取行类型 array('B', [1, 0, 0, 0]) 数字表示类型print(sheet.row_types(0))# 第一行第一个单元格print(sheet.row(0)[0])print(sheet.row(0)[0].value)# 纯获取行直print(sheet.row_values(0))# 行长度print(sheet.row_len(0))# 列数print(sheet.ncols)print(sheet.col(0))# 获取指定单元格值print(sheet.col(0)[0].value)# 列所有值 类型print(sheet.col_values(0))print(sheet.col_types(0))# 操作单元格 坐标行列print(sheet.cell(0,0))print(sheet.cell(0,0).value)print(sheet.cell_value(0,0))# 查看单元格类型 ctype亦可print(sheet.cell_type(0,0))print(sheet.cell(0,0).ctype)xlwt写出Excelimport xlwt# excel写入(xlwt) 创建工作簿# 初始化样式titlestyle = xlwt.XFStyle()titlefont = xlwt.Font()titlefont.name = "宋体"titlefont.bold = Truetitlefont.height = 11 * 20 # 11字号 20单位titlefont.colour_index = 0x08titlestyle.font = titlefont# 对齐方式cellalign = xlwt.Alignment()# 水平居中cellalign.horz = 0x02# 垂直居中cellalign.vert = 0x01titlestyle.alignment = cellalign# 边框borders = xlwt.Borders()borders.right = xlwt.Borders.DASHEDborders.bottom = xlwt.Borders.DASHEDtitlestyle.borders = borderswb = xlwt.Workbook()# 背景颜色样式title1style = xlwt.XFStyle()# 颜色模式bgcolor = xlwt.Pattern()bgcolor.pattern = xlwt.Pattern.SOLID_PATTERN# 设置颜色bgcolor.pattern_fore_colour = 22title1style.pattern = bgcolor# 创建sheetws = wb.add_sheet("2020年度")# 填充数据 参数:r1 r2 c1 c2 文字,即行列范围合并并填充文字ws.write_merge(0, 1, 0, 5, "2020年报表", titlestyle)# 写入数据data = (("title1", "title2", "title3", "title4", "title5"), ("1", "2", "3", "4", "5"), ("5", "4", "3", "2", "1"))# enumerate循环返回索引for i, item in enumerate(data): for j, val 
in enumerate(item): if j == 0: ws.write(i + 2, j, val, title1style) else: # 跳过前两行 ws.write(i + 2, j, val)# 插入第二个sheet 添加图片wsimg = wb.add_sheet("img")# 插入图片wsimg.insert_bitmap("2020.bmp", 0, 0)# 写入到当前目录wb.save("2020_BG.xls")

Python从excel模板中替换字符串并生成新文件

普通替换,不保留样式# Open Excel file from a user imputimport xlrd, xlwtfilename = raw_input("Enter Excel file name with extension (.xls) and path")oldbook = xlrd.open_workbook(filename)newbook = xlwt.Workbook()# For all the sheets in the workbookfor sheetname in oldbook.sheet_names(): oldsheet = oldbook.sheet_by_name(sheetname) newsheet = newbook.add_sheet(sheetname) # For all the rows and all the columns in an excel for ii in range(oldsheet.nrows): for jj in range(oldsheet.ncols): # Replace CellString=str(oldsheet.cell(ii, jj).Value) CellString=CellString.replace("%", "Perc") CellString=CellString.replace(" ", "_") newsheet.write(ii, jj, CellString)# Save the file in a desired location with the desired namesavelocation = raw_input("Enter a new path and file name with extension (.xls) to save the new Excel spread sheet ")newbook.save(savelocation)运用xlutils可保留样式import xlrdfrom xlutils.filter import process, XLRDReader, XLWTWriterrb = xlrd.open_workbook('excel.xls', formatting_info=True)# 参考xlutils.copy库内的用法 参考xlutils.filter内的参数定义style_listw = XLWTWriter()process(XLRDReader(rb, 'unknown.xls'), w)wb = w.output[0][1]style_list = w.style_listpeple = ("张三", "李四", "王五")for i in range(len(peple)): for j in range(i): left = peple[i] right = peple[j] for n, sheet in enumerate(rb.sheets()): sheet2 = wb.get_sheet(n) for r in range(sheet.nrows): for c, cell in enumerate(sheet.row_values(r)): style = style_list[sheet.cell_xf_index(r, c)] val = sheet.cell_value(r, c) if val == "{left}": val = left elif val == "{right}": val = right sheet2.write(r, c, val, style) # print(str(i) + str(j)+left+right) wb.save('比赛(' + left + "vs" + right + ').xls')