refactor: enhance epub parsing to support additional block-level elements and ignore database file in git
This commit is contained in:
@@ -29,3 +29,4 @@ Thumbs.db
|
|||||||
*.epub
|
*.epub
|
||||||
|
|
||||||
.fake
|
.fake
|
||||||
|
src/NexusReader.Web.New/nexus.db
|
||||||
|
|||||||
@@ -96,20 +96,23 @@ public class EpubService : IEpubService
|
|||||||
|
|
||||||
private List<string> ExtractParagraphs(string html)
|
private List<string> ExtractParagraphs(string html)
|
||||||
{
|
{
|
||||||
|
var bodyMatch = Regex.Match(html, @"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||||
|
var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html;
|
||||||
|
|
||||||
var paragraphs = new List<string>();
|
var paragraphs = new List<string>();
|
||||||
// Match <p> tags and their content
|
// Match block-level elements: h1-h6, p, ul, ol, blockquote, pre, plus standalone <hr> tags
|
||||||
var matches = Regex.Matches(html, @"<p\b[^>]*>(.*?)</p>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
// We match the whole tag to preserve it for sanitization
|
||||||
|
var matches = Regex.Matches(content, @"<(p|h[1-6]|ul|ol|blockquote|pre)\b[^>]*>.*?</\1>|<hr\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||||
|
|
||||||
foreach (Match match in matches)
|
foreach (Match match in matches)
|
||||||
{
|
{
|
||||||
paragraphs.Add(match.Groups[1].Value);
|
paragraphs.Add(match.Value);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallback: split by double newlines if no <p> tags found
|
// Fallback: split on <br> tags and blank lines if no block tags found
|
||||||
if (paragraphs.Count == 0)
|
if (paragraphs.Count == 0)
|
||||||
{
|
{
|
||||||
var bodyMatch = Regex.Match(html, @"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
paragraphs = content.Split(new[] { "<br />", "<br>", "\n\n", "\r\n\r\n" }, StringSplitOptions.RemoveEmptyEntries).ToList();
|
||||||
var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html;
|
|
||||||
paragraphs = content.Split(new[] { "<br />", "<br>", "\n\n" }, StringSplitOptions.RemoveEmptyEntries).ToList();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return paragraphs;
|
return paragraphs;
|
||||||
@@ -120,11 +123,11 @@ public class EpubService : IEpubService
|
|||||||
// 1. Remove <style> and <script> blocks
|
// 1. Remove <style> and <script> blocks
|
||||||
var clean = Regex.Replace(html, @"<(style|script)\b[^>]*>.*?</\1>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
var clean = Regex.Replace(html, @"<(style|script)\b[^>]*>.*?</\1>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||||
|
|
||||||
// 2. Remove all tags except <b>, <i>, <strong>, <em>
|
// 2. Remove all tags except allowed structural and formatting tags
|
||||||
clean = Regex.Replace(clean, @"<(?!/?(b|i|strong|em)\b)[^>]+>", "", RegexOptions.IgnoreCase);
|
clean = Regex.Replace(clean, @"<(?!/?(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b)[^>]+>", "", RegexOptions.IgnoreCase);
|
||||||
|
|
||||||
// 3. Requirement: Aggressively strip attributes (class, style, id) from allowed tags
|
// 3. Requirement: Aggressively strip attributes (class, style, id) from allowed tags
|
||||||
clean = Regex.Replace(clean, @"<(b|i|strong|em)\b[^>]*>", "<$1>", RegexOptions.IgnoreCase);
|
clean = Regex.Replace(clean, @"<(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b[^>]*>", "<$1>", RegexOptions.IgnoreCase);
|
||||||
|
|
||||||
// 4. Decode HTML entities
|
// 4. Decode HTML entities
|
||||||
clean = System.Net.WebUtility.HtmlDecode(clean);
|
clean = System.Net.WebUtility.HtmlDecode(clean);
|
||||||
|
|||||||
Reference in New Issue
Block a user