
webrpa 是一个分布式的网络爬虫系统,基于 FastAPI + FastAdmin 开发,通过 Web API 接口发起网络爬虫服务,实现流程自动化或数据自动抓取。它包含 manager(任务管理与分发)和 worker(任务执行)两部分:
```mermaid
graph LR
    client-->manager-->worker1
    manager-->worker2
    manager-->workers[worker...]
```

主要实现的功能包括:
引入 browser-use,通过 LLM 自动创建数据爬虫服务。
{
  "name": "szreorc",
  "desc": "深圳不动产查询",
  "driver": "firefox",
  "url": "",
  "debug": true,
  "window_size": "1920x1080",
  "action_timeout": 5,
  "wait_redirect": true,
  "wait_redirect_interval": 2,
  "identifier": "{username}-{BuildingName}-{UNIT_NO}",
  "credential": "{username}",
  "actions": {
    "1": {
      "desc": "确认登录",
      "action": "check_variable",
      "options": {
        "script": "return window.location.href;",
        "target": "^https://pnr.sz.gov.cn/d-ghrer/reroosp/ytcf"
      }
    },
    "10": {
      "desc": "用户名密码登录",
      "action": "click",
      "timeout": 2,
      "target": ["xpath", "//a[contains(@class, 'login-tab') and normalize-space(text())='账号密码']"]
    },
    "11": {
      "desc": "输入用户名",
      "action": "input_text",
      "target": ["xpath", "//input[@type='text' and @placeholder='请输入账号']"],
      "param": "username"
    },
    "12": {
      "desc": "增加计数",
      "action": "variable",
      "options": {
        "variable": "counter1",
        "operator": "+"
      }
    },
    "13": {
      "desc": "检测计数",
      "action": "variable",
      "stop_on_fail": true,
      "options": {
        "variable": "counter1",
        "operator": "<",
        "target": 2,
        "sleep": 2000
      }
    },
    "14": {
      "desc": "输入密码",
      "action": "input_text",
      "target": ["xpath", "//input[@type='password' and @placeholder='请输入密码']"],
      "param": "password"
    },
    "15": {
      "desc": "识别 captcha",
      "action": "decode_captcha_code",
      "target": ["xpath", "//div[contains(@class, 'captcha-body') and @title='点击刷新']"],
      "options": {
        "code_type": 11
      }
    },
    "16": {
      "desc": "输入 captcha",
      "action": "input_text",
      "target": ["xpath", "//div[contains(@class, 'account_verifying')] //input[@type='text']"]
    },
    "17": {
      "desc": "点击登录",
      "action": "click",
      "target": ["xpath", "//button[contains(@class, 'gd-btn-primary') and contains(@class, 'gd-btn') and @type='button']//span[starts-with(text(), '登录 ')]"]
    },
    "18": {
      "desc": "继续登录",
      "action": "click",
      "target": ["xpath", "//button[.//span[contains(text(), '继续登录')]]"]
    },
    "20": {
      "desc": "确认选择",
      "action": "click",
      "timeout": 10,
      "stop_on_fail": true,
      "fail_message": "login failed",
      "options": {
        "set_credential": true
      },
      "target": ["class name", "jinruxuzhi-checkbox"]
    },
    "21": {
      "desc": "确认选择下一步",
      "action": "click",
      "target": ["class name", "jinruxuzhi-buttonOk"]
    },
    "30": {
      "desc": "展开查询类型",
      "action": "click",
      "options": {
        "sleep": 2
      },
      "target": ["xpath", "//input[@type='text' and @placeholder='请选择']"]
    },
    "31": {
      "desc": "等待下拉菜单",
      "action": "wait_element",
      "options": {
        "visible": true
      },
      "target": ["css selector", "div.el-select-dropdown.el-popper"]
    },
    "32": {
      "desc": "选择查询类型",
      "action": "click",
      "target": ["xpath", "//li[contains(@class, 'el-select-dropdown__item') and span[text()='楼名及栋名']]"]
    },
    "33": {
      "desc": "输入查询内容",
      "action": "input_text",
      "target": ["xpath", "//input[@type='text' and @placeholder='请输入内容']"],
      "param": "BuildingName"
    },
    "34": {
      "desc": "点击查询",
      "action": "click",
      "target": ["class name", "el-icon-search"]
    },
    "35": {
      "desc": "点击截图对象",
      "action": "click",
      "timeout": 20,
      "stop_on_fail": true,
      "fail_message": "search failed",
      "target": ["xpath", "//div[contains(@class, 'el-dialog__wrapper')]//div[contains(@class, 'el-tabs__item') and normalize-space(text())='楼宇']"]
    },
    "40": {
      "desc": "获取数据",
      "action": "get_data",
      "options": {
        "script": "var table = document.querySelector(\"#pane-1 table.is-bordered.el-descriptions--mini\");\nvar fields = [\"土地坐落\", \"楼名及栋名\", \"房屋类型\", \"房屋性质\", \"房屋用途\"];\nvar result = {};\nif (table) {\n var rows = table.querySelectorAll(\"tr.el-descriptions-row\");\n rows.forEach(function(row) {\n var label = row.querySelector(\"th.el-descriptions-item__label\").innerText.trim();\n var content = row.querySelector(\"td.el-descriptions-item__content\").innerText.trim();\n if (fields.includes(label)) {\n result[label] = content;\n }\n });\n console.log(JSON.stringify(result));\n} else {\n console.log(\"Table not found.\");\n};\nreturn result;\n"
      }
    },
    "41": {
      "desc": "点击截图对象",
      "action": "click",
      "target": ["xpath", "//div[contains(@class, 'el-dialog__wrapper')]//div[contains(@class, 'el-tabs__item') and normalize-space(text())='房屋']"]
    },
    "42": {
      "desc": "下拉房屋查询",
      "action": "click",
      "target": ["css selector", "#pane-2 input.el-input__inner"]
    },
    "43": {
      "desc": "点击房屋查询",
      "action": "click",
      "target": ["xpath", "//li[contains(@class, 'el-select-dropdown__item')]//span[text()='{UNIT_NO}']"],
      "param": "UNIT_NO"
    },
    "44": {
      "desc": "截图",
      "action": "screenshot",
      "target": ["class name", "el-dialog__wrapper"],
      "options": {
        "visible": true
      }
    },
    "45": {
      "desc": "获取数据",
      "action": "get_data",
      "options": {
        "script": "var table = document.querySelector(\"#pane-2 table.is-bordered.el-descriptions--mini\");\nvar fields = [\"房号\", \"所在楼层\", \"建筑面积\", \"使用年限\", \"存在抵押\", \"存在查封\", \"存在异议\", \"存在居住权\"];\nvar result = {};\nif (table) {\n var rows = table.querySelectorAll(\"tr.el-descriptions-row\");\n rows.forEach(function(row) {\n var label = row.querySelector(\"th.el-descriptions-item__label\").innerText.trim();\n var content = row.querySelector(\"td.el-descriptions-item__content\").innerText.trim();\n if (fields.includes(label)) {\n result[label] = content;\n }\n });\n console.log(JSON.stringify(result));\n} else {\n console.log(\"Table not found.\");\n};\nreturn result;\n"
      }
    }
  },
  "processes": "start->1\n1(no)->10->11\n11(no)->12->13\n13(yes)->10\n11(yes)->14->15->16->17->18->20->21->30->31->32->33->34->35->40->41->42->43->44->45->end\n1(yes)->20",
  "result": ["screenshot", "data"]
}