// -----------------------------------------------------------------------------------------------
// EpubReaderService (implements IEpubReader) - overview of what the code below does.
//
// GetEpubContentAsync
//   * Loads the Ebooks row whose Id equals ebookId (and whose UserId equals userId when userId is
//     non-null), resolves its FilePath with ResolvePath and opens it with EpubReader.OpenBookAsync.
//   * A chapterIndex outside the reading order is replaced with 0 rather than rejected.
//   * The chapter title is taken from the navigation tree, then from the chapter file name without
//     extension, then from "Chapter N".
//   * Chapter HTML has its image URLs rewritten, is split into paragraphs, and each paragraph is
//     sanitized. Non-blank paragraphs become TextSegmentBlock entries ("seg-N").
//   * Each time the running word count reaches WordThreshold (1000) an AI trigger block
//     ("trigger-N") is appended and the counter is reset; a final trigger is appended when the
//     last block is not already a trigger.
//   * Any exception is logged and converted into Result.Fail.
//
// ResolvePath
//   * Walks from AppDomain.CurrentDomain.BaseDirectory up through its parents and returns the first
//     existing file under "wwwroot/" or "src/NexusReader.Web/wwwroot/"; returns null otherwise.
//
// GetEpubResourceAsync
//   * URL-decodes resourcePath and rejects it when it contains ".." or ":" or starts with "/" or "\".
//   * Compares the normalized path case-insensitively with FilePath and Key of every local content
//     file; byte files are returned as-is, text files are returned as UTF-8 bytes.
//
// RewriteImageUrls / ResolveRelativePath
//   * A src starting with "javascript:" causes the whole match to be replaced with "".
//   * http://, https:// and data: sources are left untouched.
//   * Every other src is resolved against the chapter's directory ("." and ".." segments are
//     collapsed) and rewritten to /api/epub/{ebookId}/resource?path=(url-encoded path).
//
// NOTE(review): this copy of the file looks like it was flattened onto a few physical lines and
//   passed through an HTML/XML tag stripper - TODO confirm against version control. Symptoms:
//   generic type arguments are missing (IDbContextFactory, ILogger, "Task>", "new List()",
//   "new Stack()", "IEnumerable navigation"); the regex patterns have lost their leading tag text
//   and named-group names although the code reads Groups["before"], Groups["src"], Groups["after"]
//   and Groups["alt"]; the img rebuild lambda in SanitizeParagraph computes srcAttr/altAttr but
//   returns an empty interpolated string; two separators in ExtractParagraphs' Split call are now
//   raw line breaks inside string literals. Restore from the original before building.
// NOTE(review): SanitizeParagraph calls WebUtility.HtmlDecode AFTER the tag/attribute sanitizing
//   regexes. Entity-encoded input (e.g. "&lt;script&gt;", or "&quot;" inside an alt value) is
//   therefore turned back into live markup in the returned string. If the blocks are rendered as
//   raw HTML this defeats the sanitization - verify against the consumer.
// NOTE(review): the "not found" failure message in GetEpubContentAsync includes userId, and both
//   catch blocks put ex.Message into the returned error - confirm this is acceptable to expose.
// NOTE(review): the DbContext is disposed with "using var"; "await using" may be preferable if
//   the context type implements IAsyncDisposable - confirm.
// -----------------------------------------------------------------------------------------------
using System.Text.RegularExpressions; using FluentResults; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.Logging; using NexusReader.Application.Abstractions.Services; using NexusReader.Application.Queries.Reader; using NexusReader.Data.Persistence; using VersOne.Epub; namespace NexusReader.Infrastructure.Services; /// /// Reads and parses EPUB files from the storage path recorded in the database. /// public class EpubReaderService : IEpubReader { private readonly IDbContextFactory _dbContextFactory; private readonly ILogger _logger; private const int WordThreshold = 1000; private static readonly Regex ImageTagRegex = new(@"[^>]*?\bsrc=[""'])(?[^""']*?)(?[""'][^>]*?>)", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex BodyMatchRegex = new(@"]*>(.*?)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); private static readonly Regex ParagraphMatchRegex = new(@"<(p|h[1-6]|ul|ol|blockquote|pre)\b[^>]*>.*?|]*>|]*>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); private static readonly Regex StyleScriptRegex = new(@"<(style|script)\b[^>]*>.*?", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); private static readonly Regex WhitelistTagsRegex = new(@"<(?!/?(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr|img)\b)[^>]+>", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex StripAttributesRegex = new(@"<(b|i|strong|em|h[1-6]|p|ul|ol|li|blockquote|pre|code|br|hr)\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex ImgTagSanitizerRegex = new(@"]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex SrcAttributeRegex = new(@"\bsrc=[""'](?[^""']*)[""']", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex AltAttributeRegex = new(@"\balt=[""'](?[^""']*)[""']", RegexOptions.IgnoreCase | RegexOptions.Compiled); public 
EpubReaderService( IDbContextFactory dbContextFactory, ILogger logger) { _dbContextFactory = dbContextFactory; _logger = logger; } /// public async Task> GetEpubContentAsync( Guid ebookId, int chapterIndex, string? userId = null, CancellationToken cancellationToken = default) { try { // 1. Resolve the file path from the database using var context = await _dbContextFactory.CreateDbContextAsync(cancellationToken); var ebook = await context.Ebooks .AsNoTracking() .FirstOrDefaultAsync( e => e.Id == ebookId && (userId == null || e.UserId == userId), cancellationToken); if (ebook == null) { return Result.Fail($"Ebook '{ebookId}' not found for user '{userId}'."); } // FilePath is stored as a web-relative path (e.g. "uploads/guid_title.epub"). // Resolve against the content root, then against the wwwroot sub-directory. var fullPath = ResolvePath(ebook.FilePath); if (fullPath == null || !File.Exists(fullPath)) { _logger.LogError("EPUB file for ebook {EbookId} not found at path '{FilePath}'.", ebookId, ebook.FilePath); return Result.Fail($"The EPUB file for this book could not be found on the server."); } // 2. Parse the EPUB using var bookRef = await EpubReader.OpenBookAsync(fullPath); var readingOrder = bookRef.GetReadingOrder(); if (readingOrder == null || !readingOrder.Any()) { return Result.Fail("The EPUB has no readable content files in ReadingOrder."); } if (chapterIndex < 0 || chapterIndex >= readingOrder.Count) { chapterIndex = 0; } var chapterRef = readingOrder[chapterIndex]; var navigation = bookRef.GetNavigation(); var chapterTitle = FindTitleInNavigation(navigation, chapterRef.FilePath) ?? Path.GetFileNameWithoutExtension(chapterRef.FilePath) ?? $"Chapter {chapterIndex + 1}"; var chapterContent = await chapterRef.ReadContentAsTextAsync(); // Rewrite relative image src URLs to use the server-side API endpoint chapterContent = RewriteImageUrls(chapterContent, ebookId, chapterRef.FilePath); // 3. 
Build content blocks var blocks = new List(); int totalWordCount = 0; int blockCounter = 0; var paragraphs = ExtractParagraphs(chapterContent); foreach (var p in paragraphs) { var sanitizedContent = SanitizeParagraph(p); if (string.IsNullOrWhiteSpace(sanitizedContent)) continue; blocks.Add(new TextSegmentBlock($"seg-{blockCounter++}", sanitizedContent)); int wordsInP = CountWords(sanitizedContent); totalWordCount += wordsInP; if (totalWordCount >= WordThreshold) { blocks.Add(CreateAiTrigger($"trigger-{blockCounter++}")); totalWordCount = 0; } } if (blocks.Any() && blocks.Last() is not AiActionTriggerBlock) { blocks.Add(CreateAiTrigger($"trigger-{blockCounter++}")); } return Result.Ok(new ReaderPageViewModel(blocks, chapterIndex, readingOrder.Count, chapterTitle, ebook.Id)); } catch (Exception ex) { _logger.LogError(ex, "Failed to process EPUB for ebook {EbookId}.", ebookId); return Result.Fail(new Error($"Failed to process EPUB: {ex.Message}").CausedBy(ex)); } } /// /// Attempts to resolve a web-relative storage path to an absolute filesystem path. /// Searches upward from the app base directory to handle both dev and production layouts. /// private static string? ResolvePath(string relativePath) { // Normalize forward-slashes to OS separator for file system access var normalized = relativePath.Replace('/', Path.DirectorySeparatorChar); var currentDir = new DirectoryInfo(AppDomain.CurrentDomain.BaseDirectory); while (currentDir != null) { var candidate = Path.Combine(currentDir.FullName, "wwwroot", normalized); if (File.Exists(candidate)) return candidate; // Also try src/NexusReader.Web/wwwroot (development layout) var devCandidate = Path.Combine(currentDir.FullName, "src", "NexusReader.Web", "wwwroot", normalized); if (File.Exists(devCandidate)) return devCandidate; currentDir = currentDir.Parent; } return null; } /// public async Task> GetEpubResourceAsync( Guid ebookId, string resourcePath, string? 
userId = null, CancellationToken cancellationToken = default) { try { using var context = await _dbContextFactory.CreateDbContextAsync(cancellationToken); var ebook = await context.Ebooks .AsNoTracking() .FirstOrDefaultAsync( e => e.Id == ebookId && (userId == null || e.UserId == userId), cancellationToken); if (ebook == null) { return Result.Fail($"Ebook '{ebookId}' not found."); } var fullPath = ResolvePath(ebook.FilePath); if (fullPath == null || !File.Exists(fullPath)) { return Result.Fail("EPUB file not found."); } using var bookRef = await EpubReader.OpenBookAsync(fullPath); var decodedPath = System.Net.WebUtility.UrlDecode(resourcePath); if (decodedPath.Contains("..") || decodedPath.Contains(":") || decodedPath.StartsWith("/") || decodedPath.StartsWith("\\")) { return Result.Fail("Invalid resource path."); } decodedPath = decodedPath.Replace('\\', '/').TrimStart('/'); EpubLocalContentFileRef? targetFile = null; if (bookRef.Content?.AllFiles?.Local != null) { foreach (var file in bookRef.Content.AllFiles.Local) { var filePath = file.FilePath?.Replace('\\', '/').TrimStart('/') ?? ""; var fileKey = file.Key?.Replace('\\', '/').TrimStart('/') ?? 
""; if (filePath.Equals(decodedPath, StringComparison.OrdinalIgnoreCase) || fileKey.Equals(decodedPath, StringComparison.OrdinalIgnoreCase)) { targetFile = file; break; } } } if (targetFile != null) { if (targetFile is EpubLocalByteContentFileRef byteFile) { byte[] bytes = await byteFile.ReadContentAsync(); return Result.Ok(bytes); } else if (targetFile is EpubLocalTextContentFileRef textFile) { string text = await textFile.ReadContentAsync(); byte[] bytes = System.Text.Encoding.UTF8.GetBytes(text); return Result.Ok(bytes); } } return Result.Fail($"Resource '{resourcePath}' not found in EPUB."); } catch (Exception ex) { _logger.LogError(ex, "Failed to retrieve EPUB resource '{ResourcePath}' for ebook {EbookId}.", resourcePath, ebookId); return Result.Fail(new Error($"Failed to retrieve EPUB resource: {ex.Message}").CausedBy(ex)); } } private static string RewriteImageUrls(string html, Guid ebookId, string chapterPath) { if (string.IsNullOrEmpty(html)) return html; return ImageTagRegex.Replace(html, match => { var rawSrc = match.Groups["src"].Value; if (rawSrc.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase)) { return ""; // Completely block script execution in image src } if (rawSrc.StartsWith("http://", StringComparison.OrdinalIgnoreCase) || rawSrc.StartsWith("https://", StringComparison.OrdinalIgnoreCase) || rawSrc.StartsWith("data:", StringComparison.OrdinalIgnoreCase)) { return match.Value; } var resolvedPath = ResolveRelativePath(chapterPath, rawSrc); var rewrittenSrc = $"/api/epub/{ebookId}/resource?path={System.Net.WebUtility.UrlEncode(resolvedPath)}"; return $"{match.Groups["before"].Value}{rewrittenSrc}{match.Groups["after"].Value}"; }); } private static string ResolveRelativePath(string basePath, string relativePath) { if (string.IsNullOrEmpty(relativePath)) return string.Empty; var decodedRelative = System.Net.WebUtility.UrlDecode(relativePath); var baseDir = Path.GetDirectoryName(basePath) ?? 
""; baseDir = baseDir.Replace('\\', '/'); var combined = Path.Combine(baseDir, decodedRelative).Replace('\\', '/'); var segments = combined.Split('/'); var stack = new Stack(); foreach (var segment in segments) { if (segment == "." || string.IsNullOrEmpty(segment)) { continue; } if (segment == "..") { if (stack.Count > 0) { stack.Pop(); } } else { stack.Push(segment); } } return string.Join("/", stack.Reverse()); } private static List ExtractParagraphs(string html) { var bodyMatch = BodyMatchRegex.Match(html); var content = bodyMatch.Success ? bodyMatch.Groups[1].Value : html; var paragraphs = new List(); var matches = ParagraphMatchRegex.Matches(content); foreach (Match match in matches) { paragraphs.Add(match.Value); } if (paragraphs.Count == 0) { paragraphs = content.Split(new[] { "
", "
", "\n\n", "\r\n\r\n" }, StringSplitOptions.RemoveEmptyEntries).ToList(); } return paragraphs; } private static string SanitizeParagraph(string html) { var clean = StyleScriptRegex.Replace(html, ""); clean = WhitelistTagsRegex.Replace(clean, ""); clean = StripAttributesRegex.Replace(clean, "<$1>"); // Securely sanitize img tags by keeping ONLY src and alt attributes to prevent XSS (onerror, onload, style, etc.) clean = ImgTagSanitizerRegex.Replace(clean, m => { var srcMatch = SrcAttributeRegex.Match(m.Value); var altMatch = AltAttributeRegex.Match(m.Value); var srcAttr = srcMatch.Success ? $" src=\"{srcMatch.Groups["src"].Value}\"" : ""; var altAttr = altMatch.Success ? $" alt=\"{altMatch.Groups["alt"].Value}\"" : ""; return $""; }); clean = System.Net.WebUtility.HtmlDecode(clean); return clean.Trim(); } private static int CountWords(string text) { if (string.IsNullOrWhiteSpace(text)) return 0; return text.Split(new[] { ' ', '\r', '\n', '\t' }, StringSplitOptions.RemoveEmptyEntries).Length; } private static AiActionTriggerBlock CreateAiTrigger(string id) => new(id, "Wykryto ciekawy fragment! Czy chcesz, abym wygenerował podsumowanie lub quiz z tego rozdziału?", new List { "Podsumuj", "Generuj Quiz", "Pomiń" }); private static string? FindTitleInNavigation(IEnumerable navigation, string? filePath) { if (string.IsNullOrEmpty(filePath)) return null; var fileName = Path.GetFileName(filePath); foreach (var item in navigation) { if (item.Link?.ContentFilePath == filePath || item.Link?.ContentFilePath == fileName) return item.Title; if (item.NestedItems?.Any() == true) { var childTitle = FindTitleInNavigation(item.NestedItems, filePath); if (childTitle != null) return childTitle; } } return null; } }