| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
/**
 * Clean scraped novel text: strip source-site promotion lines, decode
 * non-breaking-space entities, turn `<br>` tags into line breaks, drop all
 * remaining HTML tags, and normalise paragraphs.
 *
 * The result contains only non-empty, trimmed lines, joined by a blank line
 * (`'\n\n'`), which is the paragraph separator `paginateContent` splits on.
 *
 * @param {string} content - Raw chapter content (may contain HTML).
 * @returns {string} Cleaned content; `''` for empty or non-string input.
 */
export function cleanNovelContent(content) {
  if (typeof content !== 'string' || content === '') return ''

  const withoutMarkup = content
    // Must run BEFORE the bare-domain rule: otherwise "www.x.y" is removed
    // first and the dangling "请收藏本站:https://" prefix is left in the text.
    .replace(/请收藏本站[::]\s*https?:\/\/www\.\w+\.\w+/gi, '')
    .replace(/最新网址[::]?\s*[a-z0-9.-]+/gi, '')
    .replace(/www\.[a-z0-9]+\.[a-z]{2,}/gi, '')
    // Non-breaking spaces: named/numeric entities and the literal U+00A0.
    .replace(/&(?:nbsp|#160|#xa0);|\u00a0/gi, ' ')
    // Case-insensitive so <BR> becomes a line break instead of being
    // swallowed by the generic tag stripper below.
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<[^>]+>/g, '')

  // Collapse runs of blank lines: keep trimmed, non-empty lines only.
  return withoutMarkup
    .split('\n')
    .map(line => line.trim())
    .filter(line => line.length > 0)
    .join('\n\n')
}
-
/**
 * Split one over-long paragraph into chunks of at most `size` characters.
 *
 * A chunk ends right after the last sentence-ending mark found in the final
 * 20% of its window; if there is none, the chunk is cut hard at `size`.
 *
 * @param {string} paragraph - Paragraph longer than `size`.
 * @param {number} size - Maximum characters per chunk (integer >= 1).
 * @returns {string[]} Chunks that concatenate back to `paragraph`.
 */
function splitLongParagraph(paragraph, size) {
  const sentenceEndings = ['。', '!', '?', '.', '!', '?']
  const chunks = []
  let start = 0

  while (start < paragraph.length) {
    const end = start + size

    // The remainder fits on one page: emit it whole, no further splitting.
    if (end >= paragraph.length) {
      chunks.push(paragraph.substring(start))
      break
    }

    const slice = paragraph.substring(start, end)
    // Index is relative to `slice`, so it is compared against `size`,
    // not against the absolute offset `start`.
    const lastPunctuation = Math.max(
      ...sentenceEndings.map(mark => slice.lastIndexOf(mark))
    )

    if (lastPunctuation > size * 0.8) {
      chunks.push(slice.substring(0, lastPunctuation + 1))
      start += lastPunctuation + 1
    } else {
      chunks.push(slice)
      start = end
    }
  }

  return chunks
}

/**
 * Paginate cleaned novel content.
 *
 * Paragraphs (separated by `'\n\n'`) are packed greedily into pages. A
 * paragraph longer than one page is flushed onto its own pages and split,
 * preferably at a sentence boundary. The `'\n\n'` separators re-inserted
 * between paragraphs on the same page are not counted against `pageSize`.
 *
 * @param {string} content - Cleaned content with `'\n\n'` between paragraphs.
 * @param {number} pageSize - Characters per page (floored; must be >= 1).
 * @returns {string[]} Pages in reading order; `[]` for empty content.
 * @throws {RangeError} If `pageSize` is not a number >= 1 (would otherwise
 *   loop forever while splitting a long paragraph).
 */
export function paginateContent(content, pageSize = 800) {
  const size = Math.floor(pageSize)
  if (!(size >= 1)) {
    throw new RangeError('pageSize must be a number >= 1')
  }
  if (!content) return []

  const pages = []
  let currentPage = ''
  let currentLength = 0

  for (const paragraph of content.split('\n\n')) {
    if (currentLength + paragraph.length <= size) {
      // Paragraph still fits on the current page.
      currentPage += (currentPage ? '\n\n' : '') + paragraph
      currentLength += paragraph.length
    } else if (paragraph.length > size) {
      // Paragraph alone exceeds a page: flush what we have, then split it.
      if (currentPage) {
        pages.push(currentPage)
        currentPage = ''
        currentLength = 0
      }
      pages.push(...splitLongParagraph(paragraph, size))
    } else {
      // Paragraph fits on a page, just not on this one: start a new page.
      pages.push(currentPage)
      currentPage = paragraph
      currentLength = paragraph.length
    }
  }

  if (currentPage) {
    pages.push(currentPage)
  }

  return pages
}
|