fix: preserve and render EPUB images via dynamic server endpoint (#65)
Fixes #64

### Summary of Changes

1. **Extended `IEpubReader` & `EpubReaderService`**: Added `GetEpubResourceAsync` to handle binary data extraction of static assets (like images) from the EPUB archive.
2. **Added Client-Side HTTP Call**: Extended `WasmEpubService` to retrieve static resources from the server using the API client.
3. **Preserved and Sanitized Images**: Updated `ExtractParagraphs` and `SanitizeParagraph` to treat `<img>` tags as first-class citizens, preserving their `src` attributes and excluding them from sanitization stripping.
4. **Dynamic URL Rewriting**: Introduced a relative-to-absolute path resolution algorithm (`ResolveRelativePath`) and rewrote image `src` attributes to use the dynamic endpoint `/api/epub/{ebookId}/resource?path=...`.
5. **Registered API Resource Serving Endpoint**: Added the `/api/epub/{ebookId:guid}/resource` minimal API endpoint in `Program.cs` that maps requests directly to `GetEpubResourceAsync` and returns files with the correct MIME type.
6. **Added Unit Tests**: Created `EpubReaderServiceTests.cs` to verify all image extraction, path resolution, and sanitization/rewriting rules. All tests pass successfully.

---------

Co-authored-by: Marek Jasiński <jasins.marek@gmail.com>
Reviewed-on: #65
Co-authored-by: Antigravity <antigravity@google.com>
Co-committed-by: Antigravity <antigravity@google.com>
This commit was merged in pull request #65.
This commit is contained in:
@@ -18,6 +18,16 @@ public class EpubReaderService : IEpubReader
|
||||
private readonly ILogger<EpubReaderService> _logger;
|
||||
private const int WordThreshold = 1000;
|
||||
|
||||
// Matches an <img> tag that has a quoted src attribute. Named groups: "before" is everything
// after the literal "<img" up to and including the opening quote of src, "src" is the attribute
// value, "after" is the closing quote through the end of the tag. The literal "<img" itself is
// NOT part of any named group.
private static readonly Regex ImageTagRegex = new(@"<img\b(?<before>[^>]*?\bsrc=[""'])(?<src>[^""']*?)(?<after>[""'][^>]*?>)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
// Captures (group 1) the markup between the first <body ...> tag and the first following </body>.
private static readonly Regex BodyMatchRegex = new(@"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
|
||||
// Matches one block: a p/h1-h6/ul/ol/blockquote/pre element up to the first closing tag of the
// same name (non-greedy), or a standalone <hr> or <img> tag.
private static readonly Regex ParagraphMatchRegex = new(@"<(p|h[1-6]|ul|ol|blockquote|pre)\b[^>]*>.*?</\1>|<hr\b[^>]*>|<img\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
|
||||
// Matches a whole <style> or <script> element including its content.
private static readonly Regex StyleScriptRegex = new(@"<(style|script)\b[^>]*>.*?</\1>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
|
||||
// Matches any opening or closing tag whose name is NOT in the allowed list (negative lookahead),
// i.e. the tags that SanitizeParagraph removes.
private static readonly Regex WhitelistTagsRegex = new(@"<(?!/?(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr|img)\b)[^>]+>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
// Matches the opening tag of an allowed element (img deliberately excluded) and captures its
// name in group 1 so the tag can be re-emitted without attributes.
private static readonly Regex StripAttributesRegex = new(@"<(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
// Matches any complete <img ...> tag, with or without attributes.
private static readonly Regex ImgTagSanitizerRegex = new(@"<img\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
// Captures the value of a quoted src attribute in the "src" group; the value may not contain quotes.
private static readonly Regex SrcAttributeRegex = new(@"\bsrc=[""'](?<src>[^""']*)[""']", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
// Captures the value of a quoted alt attribute in the "alt" group; the value may not contain quotes.
private static readonly Regex AltAttributeRegex = new(@"\balt=[""'](?<alt>[^""']*)[""']", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
|
||||
public EpubReaderService(
|
||||
IDbContextFactory<AppDbContext> dbContextFactory,
|
||||
ILogger<EpubReaderService> logger)
|
||||
@@ -80,6 +90,9 @@ public class EpubReaderService : IEpubReader
|
||||
|
||||
var chapterContent = await chapterRef.ReadContentAsTextAsync();
|
||||
|
||||
// Rewrite relative image src URLs to use the server-side API endpoint
|
||||
chapterContent = RewriteImageUrls(chapterContent, ebookId, chapterRef.FilePath);
|
||||
|
||||
// 3. Build content blocks
|
||||
var blocks = new List<ContentBlock>();
|
||||
int totalWordCount = 0;
|
||||
@@ -142,13 +155,150 @@ public class EpubReaderService : IEpubReader
|
||||
return null;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// The requested path is validated before any database or file-system work is done. It is
/// rejected when it is empty or when its raw or URL-decoded form contains "..", contains ":",
/// or starts with a slash or backslash. The archive is then searched case-insensitively,
/// first for the path exactly as given and then for its URL-decoded form, against both the
/// file path and the key of every local content file. Text resources are returned as UTF-8
/// bytes. Any exception is logged and converted into a failed result.
/// </remarks>
public async Task<Result<byte[]>> GetEpubResourceAsync(
    Guid ebookId,
    string resourcePath,
    string? userId = null,
    CancellationToken cancellationToken = default)
{
    try
    {
        // Reject bad input up front: no query and no archive access for a path that can never be served.
        if (string.IsNullOrWhiteSpace(resourcePath))
        {
            return Result.Fail("Invalid resource path.");
        }

        var decodedPath = System.Net.WebUtility.UrlDecode(resourcePath);
        if (IsUnsafePath(resourcePath) || IsUnsafePath(decodedPath))
        {
            return Result.Fail("Invalid resource path.");
        }

        await using var context = await _dbContextFactory.CreateDbContextAsync(cancellationToken);
        var ebook = await context.Ebooks
            .AsNoTracking()
            .FirstOrDefaultAsync(
                e => e.Id == ebookId && (userId == null || e.UserId == userId),
                cancellationToken);

        if (ebook == null)
        {
            return Result.Fail($"Ebook '{ebookId}' not found.");
        }

        var fullPath = ResolvePath(ebook.FilePath);
        if (fullPath == null || !File.Exists(fullPath))
        {
            return Result.Fail("EPUB file not found.");
        }

        using var bookRef = await EpubReader.OpenBookAsync(fullPath);

        // The caller may hand over a path that the web framework has already decoded. Decoding it
        // a second time turns '+' into a space, so the exact path is tried first and the decoded
        // form is kept only as a fallback for callers that pass a still-encoded value.
        var candidates = new[] { NormalizePath(resourcePath), NormalizePath(decodedPath) };

        EpubLocalContentFileRef? targetFile = null;
        var localFiles = bookRef.Content?.AllFiles?.Local;
        if (localFiles != null)
        {
            foreach (var candidate in candidates)
            {
                foreach (var file in localFiles)
                {
                    var filePath = NormalizePath(file.FilePath ?? "");
                    var fileKey = NormalizePath(file.Key ?? "");
                    if (filePath.Equals(candidate, StringComparison.OrdinalIgnoreCase) ||
                        fileKey.Equals(candidate, StringComparison.OrdinalIgnoreCase))
                    {
                        targetFile = file;
                        break;
                    }
                }

                if (targetFile != null)
                {
                    break;
                }
            }
        }

        switch (targetFile)
        {
            case EpubLocalByteContentFileRef byteFile:
                return Result.Ok(await byteFile.ReadContentAsync());

            case EpubLocalTextContentFileRef textFile:
                // Text resources are served as UTF-8 bytes.
                return Result.Ok(System.Text.Encoding.UTF8.GetBytes(await textFile.ReadContentAsync()));
        }

        return Result.Fail($"Resource '{resourcePath}' not found in EPUB.");
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Failed to retrieve EPUB resource '{ResourcePath}' for ebook {EbookId}.", resourcePath, ebookId);
        return Result.Fail(new Error($"Failed to retrieve EPUB resource: {ex.Message}").CausedBy(ex));
    }

    // True when the path tries to leave the archive root or names a drive/scheme.
    static bool IsUnsafePath(string path) =>
        path.Contains("..", StringComparison.Ordinal) ||
        path.Contains(':') ||
        path.StartsWith('/') ||
        path.StartsWith('\\');

    // Archive entries are compared with forward slashes and without a leading slash.
    static string NormalizePath(string path) => path.Replace('\\', '/').TrimStart('/');
}
|
||||
|
||||
/// <summary>
/// Rewrites the <c>src</c> of every <c>&lt;img&gt;</c> tag in a chapter so that archive-relative
/// images are loaded through the <c>/api/epub/{ebookId}/resource?path=...</c> endpoint.
/// </summary>
/// <param name="html">Chapter markup; returned unchanged when null or empty.</param>
/// <param name="ebookId">Ebook identifier placed into the rewritten URL.</param>
/// <param name="chapterPath">Archive path of the chapter, used as the base for relative image paths.</param>
/// <returns>
/// The markup with image sources rewritten. Tags whose source starts with <c>javascript:</c> are
/// removed entirely; sources starting with <c>http://</c>, <c>https://</c> or <c>data:</c> are left as they are.
/// </returns>
private static string RewriteImageUrls(string html, Guid ebookId, string chapterPath)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    return ImageTagRegex.Replace(html, match =>
    {
        var before = match.Groups["before"];
        var rawSrc = match.Groups["src"].Value;

        if (rawSrc.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            return ""; // Completely block script execution in image src
        }

        var isExternalOrInline =
            rawSrc.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            rawSrc.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
            rawSrc.StartsWith("data:", StringComparison.OrdinalIgnoreCase);
        if (isExternalOrInline)
        {
            return match.Value;
        }

        var resolvedPath = ResolveRelativePath(chapterPath, rawSrc);
        var rewrittenSrc = $"/api/epub/{ebookId}/resource?path={System.Net.WebUtility.UrlEncode(resolvedPath)}";

        // The pattern matches the literal "<img" outside of its named groups, so the tag opener
        // has to be copied over explicitly; otherwise the rewritten tag would lose "<img".
        var tagOpener = match.Value.Substring(0, before.Index - match.Index);
        return $"{tagOpener}{before.Value}{rewrittenSrc}{match.Groups["after"].Value}";
    });
}
|
||||
|
||||
/// <summary>
/// Resolves a relative reference found inside a chapter against the chapter's own path and
/// returns a normalised, slash-separated path without "." or ".." segments.
/// </summary>
/// <param name="basePath">Path of the referencing chapter; only its directory part is used.</param>
/// <param name="relativePath">The reference to resolve; percent-escapes are decoded.</param>
/// <returns>
/// The resolved path, or an empty string when <paramref name="relativePath"/> is null or empty.
/// A ".." that would climb above the first segment is ignored.
/// </returns>
private static string ResolveRelativePath(string basePath, string relativePath)
{
    if (string.IsNullOrEmpty(relativePath))
    {
        return string.Empty;
    }

    // Decode %XX escapes only. WebUtility.UrlDecode applies form-encoding rules and would turn
    // a literal '+' in a file name into a space, producing a path that does not exist.
    var decodedRelative = Uri.UnescapeDataString(relativePath);

    var baseDir = (Path.GetDirectoryName(basePath) ?? "").Replace('\\', '/');

    // Path.Combine lets a rooted reference replace the base directory.
    var combined = Path.Combine(baseDir, decodedRelative).Replace('\\', '/');

    var resolved = new List<string>();
    foreach (var segment in combined.Split('/'))
    {
        switch (segment)
        {
            case "":
            case ".":
                // Empty and current-directory segments carry no information.
                break;

            case "..":
                if (resolved.Count > 0)
                {
                    resolved.RemoveAt(resolved.Count - 1);
                }
                break;

            default:
                resolved.Add(segment);
                break;
        }
    }

    return string.Join("/", resolved);
}
|
||||
|
||||
private static List<string> ExtractParagraphs(string html)
|
||||
{
|
||||
var bodyMatch = Regex.Match(html, @"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||
var bodyMatch = BodyMatchRegex.Match(html);
|
||||
var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html;
|
||||
|
||||
var paragraphs = new List<string>();
|
||||
var matches = Regex.Matches(content, @"<(p|h[1-6]|ul|ol|blockquote|pre)\b[^>]*>.*?</\1>|<hr\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||
var matches = ParagraphMatchRegex.Matches(content);
|
||||
|
||||
foreach (Match match in matches)
|
||||
{
|
||||
@@ -165,9 +315,20 @@ public class EpubReaderService : IEpubReader
|
||||
|
||||
/// <summary>
/// Reduces a block of chapter markup to the allowed tag set and removes attributes.
/// </summary>
/// <param name="html">Markup of a single content block.</param>
/// <returns>
/// The trimmed markup with style/script elements removed, non-allowed tags stripped, attributes
/// dropped from allowed tags, and every <c>&lt;img&gt;</c> reduced to its <c>src</c> and
/// <c>alt</c> attributes. Entities outside image tags are decoded; attribute values inside
/// image tags are emitted HTML-encoded.
/// </returns>
private static string SanitizeParagraph(string html)
{
    var clean = StyleScriptRegex.Replace(html, "");
    clean = WhitelistTagsRegex.Replace(clean, "");
    clean = StripAttributesRegex.Replace(clean, "<$1>");

    // Securely sanitize img tags by keeping ONLY src and alt attributes to prevent XSS (onerror, onload, style, etc.)
    // The rebuilt tags are kept out of the entity decoding below. Decoding them would turn an
    // encoded quote (&quot;) inside src/alt into a real quote and let the value break out of the
    // attribute, re-introducing arbitrary attributes such as onerror.
    var result = new System.Text.StringBuilder(clean.Length);
    var cursor = 0;

    foreach (Match imgTag in ImgTagSanitizerRegex.Matches(clean))
    {
        // NOTE(review): decoding entities in body text also turns encoded markup such as
        // &lt;script&gt; into live tags. Kept as in the original; confirm how the output is rendered.
        result.Append(System.Net.WebUtility.HtmlDecode(clean.Substring(cursor, imgTag.Index - cursor)));

        var srcMatch = SrcAttributeRegex.Match(imgTag.Value);
        var altMatch = AltAttributeRegex.Match(imgTag.Value);

        result.Append("<img");
        if (srcMatch.Success)
        {
            result.Append(" src=\"").Append(EncodeAttribute(srcMatch.Groups["src"].Value)).Append('"');
        }
        if (altMatch.Success)
        {
            result.Append(" alt=\"").Append(EncodeAttribute(altMatch.Groups["alt"].Value)).Append('"');
        }
        result.Append(" />");

        cursor = imgTag.Index + imgTag.Length;
    }

    result.Append(System.Net.WebUtility.HtmlDecode(clean.Substring(cursor)));
    return result.ToString().Trim();

    // Normalises a captured attribute value: decode whatever encoding the source used, then
    // encode once so quotes, angle brackets and ampersands cannot terminate the attribute.
    static string EncodeAttribute(string value) =>
        System.Net.WebUtility.HtmlEncode(System.Net.WebUtility.HtmlDecode(value));
}
|
||||
|
||||
Reference in New Issue
Block a user