diff --git a/.gitignore b/.gitignore index 7360c09..1197550 100644 --- a/.gitignore +++ b/.gitignore @@ -28,4 +28,5 @@ Thumbs.db *.log *.epub -.fake \ No newline at end of file +.fake +src/NexusReader.Web.New/nexus.db diff --git a/src/NexusReader.Infrastructure/Services/EpubService.cs b/src/NexusReader.Infrastructure/Services/EpubService.cs index 6a60222..68774dd 100644 --- a/src/NexusReader.Infrastructure/Services/EpubService.cs +++ b/src/NexusReader.Infrastructure/Services/EpubService.cs @@ -96,20 +96,23 @@ public class EpubService : IEpubService private List ExtractParagraphs(string html) { + var bodyMatch = Regex.Match(html, @"]*>(.*?)", RegexOptions.IgnoreCase | RegexOptions.Singleline); + var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html; + var paragraphs = new List(); - // Match

tags and their content - var matches = Regex.Matches(html, @"]*>(.*?)

", RegexOptions.IgnoreCase | RegexOptions.Singleline); + // Match block-level elements: h1-h6, p, ul, ol, blockquote, pre + // We match the whole tag to preserve it for sanitization + var matches = Regex.Matches(content, @"<(p|h[1-6]|ul|ol|blockquote|pre)\b[^>]*>.*?|]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline); + foreach (Match match in matches) { - paragraphs.Add(match.Groups[1].Value); + paragraphs.Add(match.Value); } - // Fallback: split by double newlines if no

tags found + // Fallback: split by double newlines if no block tags found if (paragraphs.Count == 0) { - var bodyMatch = Regex.Match(html, @"]*>(.*?)", RegexOptions.IgnoreCase | RegexOptions.Singleline); - var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html; - paragraphs = content.Split(new[] { "
", "
", "\n\n" }, StringSplitOptions.RemoveEmptyEntries).ToList(); + paragraphs = content.Split(new[] { "
", "
", "\n\n", "\r\n\r\n" }, StringSplitOptions.RemoveEmptyEntries).ToList(); } return paragraphs; @@ -120,11 +123,11 @@ public class EpubService : IEpubService // 1. Remove