当当电子书生成pdf示例

当当电子书生成pdf示例

准备工作

安装 Python 3、PyCharm 社区版

安装 pyautogui 命令如下:(参考:pyautogui安装教程)

pip install pyautogui

或者

pip install pyautogui -i https://mirrors.aliyun.com/pypi/simple

pyautogui 使用参考:

https://blog.csdn.net/ibiao/article/details/77859997

https://jingyan.baidu.com/article/39810a23440b20b636fda621.html

技术实现

python + java项目

第一步:抓取所有书的名录,保存到本地数据库

python 脚本

#!/usr/bin/python3

import sys

import time

import pyautogui

from PIL import ImageGrab

# Marker images used to detect the reader's state on screen
imgDir = "D:/dang/python/dang/img"
imgLoadingMark = f"{imgDir}/loadingMark.bmp"  # "loading" marker image
imgLoadFailMark = f"{imgDir}/loadFailMark.bmp"  # "load failed" marker image
imgLoginPassMark = f"{imgDir}/loginPassMark.bmp"  # "login expired" marker image

# Capture settings
bookDir = "D:/book/xxxxxxxxx"  # output directory; replace with the book name
bookDir2 = "D:/tmpbmp"  # temporary BMP directory (used when grabbing marker images)
pageCount = 300  # number of pages to capture
pageArea = (354, 72, 1012, 950)  # page region on screen as (x1, y1, x2, y2)

def marchMark(fileMark):
    """Look for a marker image on the screen.

    Args:
        fileMark: Path of the marker image file to search for.

    Returns:
        The match center reported by ``pyautogui.locateCenterOnScreen``
        (grayscale matching), or ``None`` when the marker is not found.
    """
    try:
        return pyautogui.locateCenterOnScreen(fileMark, grayscale=True)
    except pyautogui.ImageNotFoundException:
        # Per the PyAutoGUI docs, locate functions raise instead of
        # returning None on a miss (0.9.41+). Every caller in this script
        # tests the result for truthiness, so normalize a miss to None.
        return None

def catchImage(area, file):
    """Grab a screen region and save it as an image file.

    Args:
        area: Bounding box passed to ``ImageGrab.grab`` as ``bbox``.
        file: Destination path; passed unchanged to ``Image.save``.
    """
    import os  # local import: the file-level imports do not include os

    # Saving fails if the target directory is missing (e.g. a fresh per-book
    # output directory), so create it on demand.
    targetDir = os.path.dirname(file)
    if targetDir:
        os.makedirs(targetDir, exist_ok=True)

    ImageGrab.grab(bbox=area).save(file)

def checkMark():
    """Wait until the current page is ready to be captured.

    Polls the screen for the three marker images in order:

    * "loading" marker: wait 3 seconds and check again.
    * "load failed" marker: click it, wait 3 seconds and check again.
    * "login expired" marker: move the mouse onto it and give up.

    Returns:
        True when none of the markers is visible, False when the
        "login expired" marker is found.
    """
    # Iterative on purpose: retrying through recursion adds one stack frame
    # per 3-second wait and eventually raises RecursionError on a page that
    # keeps loading or keeps failing.
    while True:
        if markResult := marchMark(imgLoadingMark):
            print("imgLoadingMark", markResult)
            time.sleep(3)
            continue

        if markResult := marchMark(imgLoadFailMark):
            print("imgLoadFailMark", markResult)
            pyautogui.click(markResult)
            time.sleep(3)
            continue

        if markResult := marchMark(imgLoginPassMark):
            print("imgLoginPassMark", markResult)
            pyautogui.moveTo(markResult)
            return False

        return True

def catchPage(i):
    """Capture page ``i`` once the screen is in a capturable state.

    The page region is saved twice: as ``<bookDir>/<i>.jpg`` and as
    ``<bookDir2>/<i>.bmp``.

    Args:
        i: Page number, used as the output file name.

    Returns:
        True if the page was captured, False if ``checkMark`` reported
        that capturing cannot proceed.
    """
    if not checkMark():
        return False

    for folder, extension in ((bookDir, "jpg"), (bookDir2, "bmp")):
        catchImage(pageArea, "%s/%d.%s" % (folder, i, extension))
    return True

def startImpl(i):
    """Capture pages ``i`` through ``pageCount``, turning the page after each.

    Args:
        i: Page number to start from.

    Returns:
        True if every page up to ``pageCount`` was captured, False as soon
        as one page could not be captured.
    """
    page = i
    while page <= pageCount:
        if not catchPage(page):
            return False
        # Advance the reader to the next page.
        pyautogui.press('right')
        page += 1
    return True

def start(i):
    """Run the capture from page ``i`` and print the overall outcome.

    Args:
        i: Page number to start from.
    """
    outcome = "catch success" if startImpl(i) else "catch fail"
    print(outcome)

def catchMark(area, fileMark):
    """Grab a marker image from the screen and verify it can be located.

    The region ``area`` is saved to ``fileMark``; the saved image is then
    searched for on screen and, if found, its position is printed and the
    mouse is moved onto it.

    Args:
        area: Screen region to save as the marker image.
        fileMark: Path of the marker image file to write and look up.
    """
    catchImage(area, fileMark)

    position = marchMark(fileMark)
    if not position:
        return

    print("markResult", position)
    pyautogui.moveTo(position)

# Script entry: announce, pause 3 seconds (presumably so the reader window
# can be brought to the foreground -- confirm), then capture from page 1.
print("ready...")

time.sleep(3)

print("go")

start(1)

# Capture a single page only:
# catchPage(1948)

# Scroll-down operation (keeps pressing Ctrl+End):
# while True: pyautogui.hotkey('ctrl', 'end')

# Grab the marker images (one-pixel-high strips of the screen):

# catchMark((430, 558, 523, 558 + 1), imgLoadingMark)

# catchMark((358, 448, 440, 448 + 1), imgLoadFailMark)

# catchMark((666, 510, 856, 510 + 1), imgLoginPassMark)

# im = pyautogui.screenshot(bookDir + "/1.jpg", region=(0, 0, 300, 400))

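进入当当电子书的某个分类,例如“自然科学”,然后运行脚本中的“下翻操作”(即取消注释 `while True: pyautogui.hotkey('ctrl', 'end')` 这一行,并注释掉 `start(1)`),让页面不断向下滚动直到加载出该分类下的所有书目;之后在浏览器中手动将页面的 DOM 内容保存为“自然科学.txt”。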

java项目

pom.xml

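<dependencies>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
    <dependency>
        <groupId>com.squareup.okhttp3</groupId>
        <artifactId>okhttp</artifactId>
        <version>4.7.2</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.21</version>
    </dependency>
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.11</version>
    </dependency>
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>fontbox</artifactId>
        <version>2.0.11</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.13.1</version>
    </dependency>
</dependencies>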

CatchBookTable.java 构建书目记录,方便查找和管理

package com.xnktyu.dangdang;

import com.alibaba.fastjson.JSONArray;

import com.alibaba.fastjson.JSONObject;

import com.xnktyu.utils.*;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

import java.io.File;

import java.io.FileFilter;

import java.util.HashMap;

import java.util.Map;

public class CatchBookTable

{

private static final DBHelper local_db = new DBHelper("localhost", "root", "xxxxxxx", "dang");

/**
 * Base class for the nested table-descriptor classes.
 * Presumably {@link #toString()} yields the table name and {@link #pack(String)}
 * a column name -- confirm against the DBHelper usage.
 */
private static class tbase
{
    /**
     * Builds an identifier from a field name.
     *
     * @param field field name; must not be null
     * @return {@code "f_"} followed by the lower-cased field name
     */
    protected String pack(String field)
    {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (under a Turkish locale "ID" would otherwise lower-case to a dotless i).
        return "f_" + field.toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * @return the lower-cased simple name of the runtime class
     */
    @Override
    public String toString()
    {
        return getClass().getSimpleName().toLowerCase(java.util.Locale.ROOT);
    }
}

private static final class t_book extends tbase

{

public final String bookId = pac

相关创作