fix: preserve and render EPUB images via dynamic server endpoint (#65)

Fixes #64

### Summary of Changes

1. **Extended `IEpubReader` & `EpubReaderService`**: Added `GetEpubResourceAsync` to handle binary data extraction of static assets (like images) from the EPUB archive.
2. **Added Client-Side HTTP Call**: Extended `WasmEpubService` to retrieve static resources from the server using the API client.
3. **Preserved and Sanitized Images**: Updated `ExtractParagraphs` and `SanitizeParagraph` to treat `<img>` tags as first-class citizens, preserving their `src` attributes and excluding them from sanitization stripping.
4. **Dynamic URL Rewriting**: Introduced a relative-to-absolute path resolution algorithm (`ResolveRelativePath`) and rewrote image `src` attributes to use the dynamic endpoint `/api/epub/{ebookId}/resource?path=...`.
5. **Registered API Resource Serving Endpoint**: Added the `/api/epub/{ebookId:guid}/resource` minimal API endpoint in `Program.cs` that maps requests directly to `GetEpubResourceAsync` and returns files with the correct MIME type.
6. **Added Unit Tests**: Created `EpubReaderServiceTests.cs` to verify all image extraction, path resolution, and sanitization/rewriting rules. All tests pass successfully.

---------

Co-authored-by: Marek Jasiński <jasins.marek@gmail.com>
Reviewed-on: #65
Co-authored-by: Antigravity <antigravity@google.com>
Co-committed-by: Antigravity <antigravity@google.com>
2026-06-01 16:04:56 +00:00
parent 21c9a66cce
commit bf31effd36
12 changed files with 554 additions and 11 deletions
@@ -18,6 +18,16 @@ public class EpubReaderService : IEpubReader
    private readonly ILogger<EpubReaderService> _logger;
    private const int WordThreshold = 1000;

+    private static readonly Regex ImageTagRegex = new(@"<img\b(?<before>[^>]*?\bsrc=[""'])(?<src>[^""']*?)(?<after>[""'][^>]*?>)", RegexOptions.IgnoreCase | RegexOptions.Compiled); // Splits an <img> tag around its quoted src value: "before" ends with the opening quote, "after" starts with the closing quote.
+    private static readonly Regex BodyMatchRegex = new(@"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); // Group 1 is everything between the opening <body ...> tag and </body>.
+    private static readonly Regex ParagraphMatchRegex = new(@"<(p|h[1-6]|ul|ol|blockquote|pre)\b[^>]*>.*?</\1>|<hr\b[^>]*>|<img\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); // One match per p/h1-h6/ul/ol/blockquote/pre element, or per standalone <hr> or <img> tag.
+    private static readonly Regex StyleScriptRegex = new(@"<(style|script)\b[^>]*>.*?</\1>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); // Whole <style>/<script> elements, content included.
+    private static readonly Regex WhitelistTagsRegex = new(@"<(?!/?(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr|img)\b)[^>]+>", RegexOptions.IgnoreCase | RegexOptions.Compiled); // Any <...> construct, opening or closing, whose tag name is not in the allowed list.
+    private static readonly Regex StripAttributesRegex = new(@"<(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled); // Allowed tags other than <img> in opening form (closing tags start with "</" and do not match); group 1 is the tag name.
+    private static readonly Regex ImgTagSanitizerRegex = new(@"<img\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled); // A complete <img ...> tag, up to the first '>'.
+    private static readonly Regex SrcAttributeRegex = new(@"\bsrc=[""'](?<src>[^""']*)[""']", RegexOptions.IgnoreCase | RegexOptions.Compiled); // Quoted src attribute; the captured value stops at the first quote character of either kind.
+    private static readonly Regex AltAttributeRegex = new(@"\balt=[""'](?<alt>[^""']*)[""']", RegexOptions.IgnoreCase | RegexOptions.Compiled); // Quoted alt attribute; the captured value stops at the first quote character of either kind.
+
    public EpubReaderService(
        IDbContextFactory<AppDbContext> dbContextFactory,
        ILogger<EpubReaderService> logger)
@@ -80,6 +90,9 @@ public class EpubReaderService : IEpubReader

            var chapterContent = await chapterRef.ReadContentAsTextAsync();

+            // Rewrite relative image src URLs to use the server-side API endpoint
+            chapterContent = RewriteImageUrls(chapterContent, ebookId, chapterRef.FilePath);
+
            // 3. Build content blocks
            var blocks = new List<ContentBlock>();
            int totalWordCount = 0;
@@ -142,13 +155,150 @@ public class EpubReaderService : IEpubReader
        return null;
    }

+    /// <inheritdoc />
+    /// <remarks>
+    /// The requested path is validated first, then the ebook is looked up (restricted to
+    /// <paramref name="userId"/> when one is supplied) and the EPUB's local content files are searched
+    /// for an entry whose <c>FilePath</c> or <c>Key</c> equals the path, ignoring case. Text entries are
+    /// returned as UTF-8 bytes. Any exception is logged and converted into a failed result.
+    /// </remarks>
+    public async Task<Result<byte[]>> GetEpubResourceAsync(
+        Guid ebookId,
+        string resourcePath,
+        string? userId = null,
+        CancellationToken cancellationToken = default)
+    {
+        try
+        {
+            // Reject malformed requests before touching the database or opening the archive.
+            if (string.IsNullOrWhiteSpace(resourcePath))
+            {
+                return Result.Fail("Invalid resource path.");
+            }
+
+            // NOTE(review): a value bound from a query string has normally been URL-decoded already;
+            // decoding it a second time turns a literal '+' into a space. Confirm against the endpoint
+            // and the other callers before removing this call.
+            var decodedPath = (System.Net.WebUtility.UrlDecode(resourcePath) ?? string.Empty).Replace('\\', '/');
+
+            // Only a whole ".." segment is a traversal attempt; names such as "fig..1.png" are legitimate.
+            var hasParentSegment = Array.IndexOf(decodedPath.Split('/'), "..") >= 0;
+            if (hasParentSegment || decodedPath.Contains(':') || decodedPath.StartsWith('/'))
+            {
+                return Result.Fail("Invalid resource path.");
+            }
+
+            await using var context = await _dbContextFactory.CreateDbContextAsync(cancellationToken);
+            var ebook = await context.Ebooks
+                .AsNoTracking()
+                .FirstOrDefaultAsync(
+                    e => e.Id == ebookId && (userId == null || e.UserId == userId),
+                    cancellationToken);
+
+            if (ebook == null)
+            {
+                return Result.Fail($"Ebook '{ebookId}' not found.");
+            }
+
+            var fullPath = ResolvePath(ebook.FilePath);
+            if (fullPath == null || !File.Exists(fullPath))
+            {
+                return Result.Fail("EPUB file not found.");
+            }
+
+            using var bookRef = await EpubReader.OpenBookAsync(fullPath);
+
+            // Archive entries are compared after the same separator normalisation as the request.
+            bool IsRequestedEntry(string? entryPath) =>
+                entryPath is not null &&
+                entryPath.Replace('\\', '/').TrimStart('/').Equals(decodedPath, StringComparison.OrdinalIgnoreCase);
+
+            EpubLocalContentFileRef? targetFile = null;
+            var localFiles = bookRef.Content?.AllFiles?.Local;
+            if (localFiles != null)
+            {
+                foreach (var file in localFiles)
+                {
+                    if (IsRequestedEntry(file.FilePath) || IsRequestedEntry(file.Key))
+                    {
+                        targetFile = file;
+                        break;
+                    }
+                }
+            }
+
+            switch (targetFile)
+            {
+                case EpubLocalByteContentFileRef byteFile:
+                {
+                    byte[] bytes = await byteFile.ReadContentAsync();
+                    return Result.Ok(bytes);
+                }
+                case EpubLocalTextContentFileRef textFile:
+                {
+                    string text = await textFile.ReadContentAsync();
+                    byte[] bytes = System.Text.Encoding.UTF8.GetBytes(text);
+                    return Result.Ok(bytes);
+                }
+            }
+
+            // No matching entry, or an entry of a kind that is neither byte nor text content.
+            return Result.Fail($"Resource '{resourcePath}' not found in EPUB.");
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex, "Failed to retrieve EPUB resource '{ResourcePath}' for ebook {EbookId}.", resourcePath, ebookId);
+            return Result.Fail(new Error($"Failed to retrieve EPUB resource: {ex.Message}").CausedBy(ex));
+        }
+    }
+
+    // Rewrites the src of every <img> tag matched by ImageTagRegex so that book-internal images are
+    // served through the per-ebook resource endpoint. Tags with a "javascript:" src are removed
+    // entirely; tags whose src is an http(s) or data URL are returned untouched.
+    private static string RewriteImageUrls(string html, Guid ebookId, string chapterPath)
+    {
+        if (string.IsNullOrEmpty(html))
+        {
+            return html;
+        }
+
+        return ImageTagRegex.Replace(html, RewriteTag);
+
+        string RewriteTag(Match tag)
+        {
+            var source = tag.Groups["src"].Value;
+
+            // Drop the whole tag rather than keep a script URL around.
+            if (HasPrefix(source, "javascript:"))
+            {
+                return "";
+            }
+
+            var isExternal = HasPrefix(source, "http://")
+                || HasPrefix(source, "https://")
+                || HasPrefix(source, "data:");
+            if (isExternal)
+            {
+                return tag.Value;
+            }
+
+            // Everything else is treated as a path relative to the chapter file inside the archive.
+            var archivePath = ResolveRelativePath(chapterPath, source);
+            var endpointUrl = $"/api/epub/{ebookId}/resource?path={System.Net.WebUtility.UrlEncode(archivePath)}";
+            return $"{tag.Groups["before"].Value}{endpointUrl}{tag.Groups["after"].Value}";
+        }
+
+        static bool HasPrefix(string value, string prefix) =>
+            value.StartsWith(prefix, StringComparison.OrdinalIgnoreCase);
+    }
+
+    // Resolves a URL-style reference found in a chapter (relativePath) against the archive path of that
+    // chapter (basePath) and returns a normalised, '/'-separated archive path without a leading slash.
+    // Returns an empty string when the reference is null, empty, or consists only of a query/fragment.
+    // The work is done on plain strings instead of System.IO.Path so the result does not depend on the
+    // host operating system's separator and rooting rules.
+    private static string ResolveRelativePath(string basePath, string relativePath)
+    {
+        if (string.IsNullOrEmpty(relativePath))
+        {
+            return string.Empty;
+        }
+
+        // A "?query" or "#fragment" suffix is not part of the file name inside the archive. It is cut
+        // off before decoding so that an escaped "%23"/"%3F" in a file name survives.
+        var suffixStart = relativePath.IndexOfAny(new[] { '?', '#' });
+        var pathPart = suffixStart >= 0 ? relativePath.Substring(0, suffixStart) : relativePath;
+        if (pathPart.Length == 0)
+        {
+            return string.Empty;
+        }
+
+        // UnescapeDataString decodes %XX sequences but, unlike WebUtility.UrlDecode, keeps a literal '+'.
+        var reference = Uri.UnescapeDataString(pathPart).Replace('\\', '/');
+
+        var combined = reference;
+        if (!reference.StartsWith('/'))
+        {
+            // Normalise separators before looking for the chapter's directory; a reference that starts
+            // with '/' is resolved from the archive root and ignores the chapter directory.
+            var normalizedBase = (basePath ?? string.Empty).Replace('\\', '/');
+            var lastSlash = normalizedBase.LastIndexOf('/');
+            if (lastSlash > 0)
+            {
+                combined = normalizedBase.Substring(0, lastSlash) + "/" + reference;
+            }
+        }
+
+        var resolved = new List<string>();
+        foreach (var segment in combined.Split('/'))
+        {
+            switch (segment)
+            {
+                case "":
+                case ".":
+                    break;
+                case "..":
+                    // Never climb above the archive root; surplus ".." segments are ignored.
+                    if (resolved.Count > 0)
+                    {
+                        resolved.RemoveAt(resolved.Count - 1);
+                    }
+                    break;
+                default:
+                    resolved.Add(segment);
+                    break;
+            }
+        }
+
+        return string.Join("/", resolved);
+    }
+
    private static List<string> ExtractParagraphs(string html)
    {
-        var bodyMatch = Regex.Match(html, @"<body\b[^>]*>(.*?)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
+        var bodyMatch = BodyMatchRegex.Match(html);
        var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html;

        var paragraphs = new List<string>();
-        var matches = Regex.Matches(content, @"<(p|h[1-6]|ul|ol|blockquote|pre)\b[^>]*>.*?</\1>|<hr\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
+        var matches = ParagraphMatchRegex.Matches(content);

        foreach (Match match in matches)
        {
@@ -165,9 +315,20 @@ public class EpubReaderService : IEpubReader

    private static string SanitizeParagraph(string html)
    {
-        var clean = Regex.Replace(html, @"<(style|script)\b[^>]*>.*?</\1>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
-        clean = Regex.Replace(clean, @"<(?!/?(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b)[^>]+>", "", RegexOptions.IgnoreCase);
-        clean = Regex.Replace(clean, @"<(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b[^>]*>", "<$1>", RegexOptions.IgnoreCase);
+        // Reduces an HTML fragment to the allowed tag set: style/script elements are removed with their
+        // content, disallowed tags are stripped, allowed tags lose all attributes, <img> keeps only its
+        // quoted src and alt, and finally entities are decoded and the result is trimmed.
+        var clean = StyleScriptRegex.Replace(html, "");
+        clean = WhitelistTagsRegex.Replace(clean, "");
+        clean = StripAttributesRegex.Replace(clean, "<$1>");
+
+        // Securely sanitize img tags by keeping ONLY src and alt attributes to prevent XSS (onerror, onload, style, etc.)
+        clean = ImgTagSanitizerRegex.Replace(clean, m =>
+        {
+            var srcMatch = SrcAttributeRegex.Match(m.Value);
+            var altMatch = AltAttributeRegex.Match(m.Value);
+            var srcAttr = srcMatch.Success ? $" src=\"{ProtectAttribute(srcMatch.Groups["src"].Value)}\"" : "";
+            var altAttr = altMatch.Success ? $" alt=\"{ProtectAttribute(altMatch.Groups["alt"].Value)}\"" : "";
+            return $"<img{srcAttr}{altAttr} />";
+        });
+
+        // NOTE(review): this decode also turns entity-encoded markup in ordinary text (for example
+        // "&lt;svg onload=...&gt;") into live tags after the tag filtering above has already run.
+        // Confirm how the result is rendered before relying on it as sanitized HTML.
        clean = System.Net.WebUtility.HtmlDecode(clean);
        return clean.Trim();
+
+        // The HtmlDecode call above runs over the rebuilt <img> tags as well, so an entity such as
+        // "&quot;" inside src/alt would become a real quote and let the value break out of its
+        // attribute. Encoding the normalised value twice means that single decode leaves exactly one
+        // level of encoding in place, which is what a quoted HTML attribute needs.
+        static string ProtectAttribute(string rawValue)
+        {
+            var text = System.Net.WebUtility.HtmlDecode(rawValue);
+            return System.Net.WebUtility.HtmlEncode(System.Net.WebUtility.HtmlEncode(text));
+        }
    }