refactor: enhance EPUB parsing to support additional block-level elements and ignore the database file in Git
This commit is contained in:
@@ -29,3 +29,4 @@ Thumbs.db
|
||||
*.epub
|
||||
|
||||
.fake
|
||||
src/NexusReader.Web.New/nexus.db
|
||||
|
||||
@@ -96,20 +96,23 @@ public class EpubService : IEpubService
|
||||
|
||||
private List<string> ExtractParagraphs(string html)
|
||||
{
|
||||
var bodyMatch = Regex.Match(html, @"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||
var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html;
|
||||
|
||||
var paragraphs = new List<string>();
|
||||
// Match <p> tags and their content
|
||||
var matches = Regex.Matches(html, @"<p\b[^>]*>(.*?)</p>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||
// Match block-level elements: h1-h6, p, ul, ol, blockquote, pre, plus standalone <hr> tags
|
||||
// We match the whole tag to preserve it for sanitization
|
||||
var matches = Regex.Matches(content, @"<(p|h[1-6]|ul|ol|blockquote|pre)\b[^>]*>.*?</\1>|<hr\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||
|
||||
foreach (Match match in matches)
|
||||
{
|
||||
paragraphs.Add(match.Groups[1].Value);
|
||||
paragraphs.Add(match.Value);
|
||||
}
|
||||
|
||||
// Fallback: split by double newlines if no <p> tags found
|
||||
// Fallback: split by double newlines if no block tags found
|
||||
if (paragraphs.Count == 0)
|
||||
{
|
||||
var bodyMatch = Regex.Match(html, @"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||
var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html;
|
||||
paragraphs = content.Split(new[] { "<br />", "<br>", "\n\n" }, StringSplitOptions.RemoveEmptyEntries).ToList();
|
||||
paragraphs = content.Split(new[] { "<br />", "<br>", "\n\n", "\r\n\r\n" }, StringSplitOptions.RemoveEmptyEntries).ToList();
|
||||
}
|
||||
|
||||
return paragraphs;
|
||||
@@ -120,11 +123,11 @@ public class EpubService : IEpubService
|
||||
// 1. Remove <style> and <script> blocks
|
||||
var clean = Regex.Replace(html, @"<(style|script)\b[^>]*>.*?</\1>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||
|
||||
// 2. Remove all tags except <b>, <i>, <strong>, <em>
|
||||
clean = Regex.Replace(clean, @"<(?!/?(b|i|strong|em)\b)[^>]+>", "", RegexOptions.IgnoreCase);
|
||||
// 2. Remove all tags except allowed structural and formatting tags
|
||||
clean = Regex.Replace(clean, @"<(?!/?(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b)[^>]+>", "", RegexOptions.IgnoreCase);
|
||||
|
||||
// 3. Requirement: Aggressively strip all attributes (e.g. class, style, id) from the opening tags of allowed elements
|
||||
clean = Regex.Replace(clean, @"<(b|i|strong|em)\b[^>]*>", "<$1>", RegexOptions.IgnoreCase);
|
||||
clean = Regex.Replace(clean, @"<(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b[^>]*>", "<$1>", RegexOptions.IgnoreCase);
|
||||
|
||||
// 4. Decode HTML entities
|
||||
clean = System.Net.WebUtility.HtmlDecode(clean);
|
||||
|
||||
Reference in New Issue
Block a user