/**
 * Cleans raw novel content by stripping source-site advertisements and HTML.
 *
 * Steps: remove site banners/URLs, turn space entities into plain spaces,
 * turn `<br>` tags into newlines, drop every remaining HTML tag, then trim
 * each line and discard empty ones.
 *
 * @param {string} content - Raw content, possibly containing HTML markup.
 * @returns {string} Cleaned text with paragraphs separated by one blank line
 *   (`'\n\n'`); an empty string when `content` is falsy.
 */
export function cleanNovelContent(content) {
  if (!content) return ''

  const cleanContent = content
    .replace(/最新网址[::]?\s*[a-z0-9.-]+/gi, '')
    // The "bookmark this site" banner has to be removed before the bare
    // `www.` rule below; otherwise its domain is eaten first and the
    // leftover "请收藏本站:https://" can no longer match.
    .replace(/请收藏本站[::]?\s*https?:\/\/www\.\w+\.\w+/g, '')
    .replace(/www\.[a-z0-9]+\.[a-z]{2,}/gi, '')
    // Non-breaking space entities and the raw U+00A0 character.
    .replace(/&nbsp;|&#160;|\u00a0/g, ' ')
    // Line-break tags: <br>, <br/>, <br />, in any letter case.
    .replace(/<br\s*\/?>/gi, '\n')
    // Every remaining HTML tag.
    .replace(/<[^>]+>/g, '')

  // Collapse runs of blank lines: keep only non-empty, trimmed lines.
  return cleanContent
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .join('\n\n')
}

// Characters treated as sentence endings when splitting an over-long paragraph.
const SENTENCE_END_MARKS = ['。', '!', '?', '.', '!', '?']

// A sentence ending is only used as a split point when it lies beyond this
// fraction of the page, so pages are never cut much shorter than `pageSize`.
const MIN_PAGE_FILL_RATIO = 0.8

/**
 * Splits one paragraph that is longer than a page into page-sized chunks,
 * preferring to cut right after a sentence-ending mark near the end of a chunk.
 *
 * @param {string} paragraph - Paragraph longer than `pageSize`.
 * @param {number} pageSize - Maximum characters per chunk (integer >= 1).
 * @returns {string[]} Chunks in order; concatenated they equal `paragraph`.
 */
function splitLongParagraph(paragraph, pageSize) {
  const chunks = []
  let start = 0

  while (start < paragraph.length) {
    // The remainder fits on one page: emit it whole, no need to look for a cut.
    if (paragraph.length - start <= pageSize) {
      chunks.push(paragraph.substring(start))
      break
    }

    const chunk = paragraph.substring(start, start + pageSize)
    // Index is relative to `chunk`, so it is compared against the page size
    // alone, not against the absolute offset `start`.
    const lastMark = Math.max(
      ...SENTENCE_END_MARKS.map((mark) => chunk.lastIndexOf(mark))
    )
    const cut = lastMark > pageSize * MIN_PAGE_FILL_RATIO ? lastMark + 1 : pageSize

    chunks.push(chunk.substring(0, cut))
    start += cut
  }

  return chunks
}

/**
 * Splits cleaned novel content into pages.
 *
 * Paragraphs (separated by `'\n\n'`) are packed into a page while the sum of
 * their lengths stays within `pageSize`; the `'\n\n'` separators themselves
 * are not counted. A paragraph longer than `pageSize` is split across several
 * pages of its own, preferably at sentence endings.
 *
 * @param {string} content - Cleaned content, paragraphs separated by `'\n\n'`.
 * @param {number} pageSize - Characters per page; fractional values are
 *   rounded down.
 * @returns {string[]} Pages in reading order; empty array for empty content.
 * @throws {RangeError} If `pageSize` is not a number >= 1 (such values would
 *   otherwise make the splitting loop never advance).
 */
export function paginateContent(content, pageSize = 800) {
  const size = Math.floor(Number(pageSize))
  if (!(size >= 1)) {
    throw new RangeError('pageSize must be a number >= 1')
  }
  if (!content) return []

  const pages = []
  let currentPage = ''
  let currentLength = 0

  for (const paragraph of content.split('\n\n')) {
    // The paragraph still fits on the current page.
    if (currentLength + paragraph.length <= size) {
      currentPage += (currentPage ? '\n\n' : '') + paragraph
      currentLength += paragraph.length
      continue
    }

    // It does not fit: close the current page first.
    if (currentPage) {
      pages.push(currentPage)
    }

    if (paragraph.length > size) {
      // The paragraph alone exceeds a page: spread it over dedicated pages.
      for (const chunk of splitLongParagraph(paragraph, size)) {
        pages.push(chunk)
      }
      currentPage = ''
      currentLength = 0
    } else {
      // The paragraph starts the next page.
      currentPage = paragraph
      currentLength = paragraph.length
    }
  }

  if (currentPage) {
    pages.push(currentPage)
  }

  return pages
}