refactor: enhance epub parsing to support additional block-level elements and ignore database file in git

2026-04-26 15:27:57 +02:00
parent 7859c9806f
commit 82d726097f
2 changed files with 15 additions and 11 deletions
@@ -29,3 +29,4 @@ Thumbs.db
 *.epub
 .fake
 src/NexusReader.Web.New/nexus.db
@@ -96,20 +96,23 @@ public class EpubService : IEpubService
    private List<string> ExtractParagraphs(string html)
    {
        var bodyMatch = Regex.Match(html, @"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html;
        var paragraphs = new List<string>();
-        // Match <p> tags and their content
+        // Match block-level elements: h1-h6, p, ul, ol, blockquote, pre, plus standalone <hr> tags
-        var matches = Regex.Matches(html, @"<p\b[^>]*>(.*?)</p>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
+        // We match the whole tag to preserve it for sanitization
        var matches = Regex.Matches(content, @"<(p|h[1-6]|ul|ol|blockquote|pre)\b[^>]*>.*?</\1>|<hr\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        foreach (Match match in matches)
        {
-            paragraphs.Add(match.Groups[1].Value);
+            paragraphs.Add(match.Value);
        }
-        // Fallback: split by double newlines if no <p> tags found
+        // Fallback: split on <br> tags and blank lines (double newlines) if no block tags found
        if (paragraphs.Count == 0)
        {
-            var bodyMatch = Regex.Match(html, @"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
+            paragraphs = content.Split(new[] { "<br />", "<br>", "\n\n", "\r\n\r\n" }, StringSplitOptions.RemoveEmptyEntries).ToList();
            var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html;
            paragraphs = content.Split(new[] { "<br />", "<br>", "\n\n" }, StringSplitOptions.RemoveEmptyEntries).ToList();
        }
        return paragraphs;
@@ -120,11 +123,11 @@ public class EpubService : IEpubService
        // 1. Remove <style> and <script> blocks
        var clean = Regex.Replace(html, @"<(style|script)\b[^>]*>.*?</\1>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
-        // 2. Remove all tags except <b>, <i>, <strong>, <em>
+        // 2. Remove all tags except allowed structural and formatting tags
-        clean = Regex.Replace(clean, @"<(?!/?(b|i|strong|em)\b)[^>]+>", "", RegexOptions.IgnoreCase);
+        clean = Regex.Replace(clean, @"<(?!/?(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b)[^>]+>", "", RegexOptions.IgnoreCase);
        // 3. Requirement: Aggressively strip attributes (class, style, id) from allowed tags
-        clean = Regex.Replace(clean, @"<(b|i|strong|em)\b[^>]*>", "<$1>", RegexOptions.IgnoreCase);
+        clean = Regex.Replace(clean, @"<(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b[^>]*>", "<$1>", RegexOptions.IgnoreCase);
        // 4. Decode HTML entities
        clean = System.Net.WebUtility.HtmlDecode(clean);