refactor: enhance epub parsing to support additional block-level elements and ignore database file in git

This commit is contained in:
2026-04-26 15:27:57 +02:00
parent 7859c9806f
commit 82d726097f
2 changed files with 15 additions and 11 deletions
@@ -96,20 +96,23 @@ public class EpubService : IEpubService
private List<string> ExtractParagraphs(string html)
{
var bodyMatch = Regex.Match(html, @"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html;
var paragraphs = new List<string>();
// Match <p> tags and their content
var matches = Regex.Matches(html, @"<p\b[^>]*>(.*?)</p>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
// Match block-level elements: h1-h6, p, ul, ol, blockquote, pre, plus standalone <hr> tags
// We keep the whole element (opening and closing tags included) to preserve it for sanitization
var matches = Regex.Matches(content, @"<(p|h[1-6]|ul|ol|blockquote|pre)\b[^>]*>.*?</\1>|<hr\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
foreach (Match match in matches)
{
paragraphs.Add(match.Groups[1].Value);
paragraphs.Add(match.Value);
}
// Fallback: split by double newlines if no <p> tags found
// Fallback: split on <br> tags and blank lines if no block tags found
if (paragraphs.Count == 0)
{
var bodyMatch = Regex.Match(html, @"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html;
paragraphs = content.Split(new[] { "<br />", "<br>", "\n\n" }, StringSplitOptions.RemoveEmptyEntries).ToList();
paragraphs = content.Split(new[] { "<br />", "<br>", "\n\n", "\r\n\r\n" }, StringSplitOptions.RemoveEmptyEntries).ToList();
}
return paragraphs;
@@ -120,11 +123,11 @@ public class EpubService : IEpubService
// 1. Remove <style> and <script> blocks
var clean = Regex.Replace(html, @"<(style|script)\b[^>]*>.*?</\1>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
// 2. Remove all tags except <b>, <i>, <strong>, <em>
clean = Regex.Replace(clean, @"<(?!/?(b|i|strong|em)\b)[^>]+>", "", RegexOptions.IgnoreCase);
// 2. Remove all tags except allowed structural and formatting tags
clean = Regex.Replace(clean, @"<(?!/?(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b)[^>]+>", "", RegexOptions.IgnoreCase);
// 3. Requirement: Aggressively strip attributes (class, style, id) from allowed tags
clean = Regex.Replace(clean, @"<(b|i|strong|em)\b[^>]*>", "<$1>", RegexOptions.IgnoreCase);
clean = Regex.Replace(clean, @"<(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b[^>]*>", "<$1>", RegexOptions.IgnoreCase);
// 4. Decode HTML entities
clean = System.Net.WebUtility.HtmlDecode(clean);