177 lines
7.4 KiB
C#
177 lines
7.4 KiB
C#
using System.Text.RegularExpressions;
|
|
using FluentResults;
|
|
using MediatR;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using Microsoft.Extensions.DependencyInjection;
|
|
using Microsoft.Extensions.Logging;
|
|
using NexusReader.Application.Abstractions.Messaging;
|
|
using NexusReader.Application.Abstractions.Persistence;
|
|
using NexusReader.Application.Abstractions.Services;
|
|
|
|
namespace NexusReader.Application.Commands.Library;
|
|
|
|
/// <summary>
/// Command that asks for an already stored ebook to be processed:
/// its chapters are extracted, chunked and sent to the knowledge service.
/// </summary>
/// <param name="EbookId">Identifier used to look the ebook up in the repository.</param>
/// <param name="UserId">User that receives the ingestion progress broadcasts.</param>
/// <param name="TenantId">Tenant identifier forwarded to the knowledge service with every chunk.</param>
public record ProcessEbookCommand(Guid EbookId, string UserId, string TenantId) : ICommand<bool>;
|
|
|
|
/// <summary>
/// Handles <see cref="ProcessEbookCommand"/>: loads the ebook, extracts the text of its
/// chapters, splits every chapter into word-bounded chunks, passes each chunk to the
/// knowledge service and finally flags the ebook as ready for reading. Progress is
/// reported to the requesting user through the broadcaster along the way.
/// </summary>
public class ProcessEbookCommandHandler : IRequestHandler<ProcessEbookCommand, Result<bool>>
{
    /// <summary>Chapters with fewer characters than this are skipped.</summary>
    private const int MinChapterLength = 100;

    /// <summary>Upper bound on the number of words sent to the knowledge service at once.</summary>
    private const int MaxWordsPerChunk = 3000;

    private readonly IEbookRepository _ebookRepository;
    private readonly IKnowledgeService _knowledgeService;
    private readonly IEpubExtractor _epubExtractor;
    private readonly ISyncBroadcaster _broadcaster;
    private readonly ILogger<ProcessEbookCommandHandler> _logger;

    /// <summary>
    /// Creates the handler with the collaborators it needs.
    /// </summary>
    /// <param name="ebookRepository">Repository used to load the ebook and persist its ready flag.</param>
    /// <param name="knowledgeService">Service that receives every text chunk.</param>
    /// <param name="epubExtractor">Extractor that returns the text of the ebook's chapters.</param>
    /// <param name="broadcaster">Channel used to report ingestion progress to the user.</param>
    /// <param name="logger">Logger for diagnostic output.</param>
    public ProcessEbookCommandHandler(
        IEbookRepository ebookRepository,
        IKnowledgeService knowledgeService,
        IEpubExtractor epubExtractor,
        ISyncBroadcaster broadcaster,
        ILogger<ProcessEbookCommandHandler> logger)
    {
        _ebookRepository = ebookRepository;
        _knowledgeService = knowledgeService;
        _epubExtractor = epubExtractor;
        _broadcaster = broadcaster;
        _logger = logger;
    }

    /// <summary>
    /// Runs the ingestion pipeline for the ebook named by <paramref name="request"/>.
    /// </summary>
    /// <param name="request">Command carrying the ebook, user and tenant identifiers.</param>
    /// <param name="cancellationToken">Token that aborts processing when cancelled.</param>
    /// <returns>
    /// <c>Ok(true)</c> when processing ran to completion (individual chunk failures are logged,
    /// not fatal); a failed result when the ebook is missing, extraction fails, the EPUB has no
    /// chapters, or an unexpected exception occurs.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// Propagated when <paramref name="cancellationToken"/> is cancelled, so cancellation is not
    /// reported to the user as an indexing error.
    /// </exception>
    public async Task<Result<bool>> Handle(ProcessEbookCommand request, CancellationToken cancellationToken)
    {
        _logger.LogInformation("[ProcessEbook] Starting background processing for Ebook: {EbookId}", request.EbookId);

        try
        {
            await _broadcaster.BroadcastIngestionProgressAsync(request.UserId, "Wyszukiwanie e-booka w bazie danych...", 0.05, cancellationToken);

            var ebook = await _ebookRepository.FindByIdAsync(request.EbookId, cancellationToken);
            if (ebook == null)
            {
                _logger.LogError("[ProcessEbook] Ebook not found in database: {EbookId}", request.EbookId);
                return Result.Fail<bool>($"Ebook nie znaleziony w bazie danych: {request.EbookId}");
            }

            _logger.LogInformation("[ProcessEbook] Extracting chapters text for Ebook: {Title} ({FilePath})", ebook.Title, ebook.FilePath);
            await _broadcaster.BroadcastIngestionProgressAsync(request.UserId, "Otwieranie i parsowanie pliku EPUB...", 0.1, cancellationToken);

            var extractionResult = await _epubExtractor.ExtractChaptersTextAsync(ebook.FilePath, cancellationToken);
            if (extractionResult.IsFailed)
            {
                var errorMsg = extractionResult.Errors.FirstOrDefault()?.Message ?? "Failed to extract text chapters.";
                _logger.LogError("[ProcessEbook] Extraction failed: {Error}", errorMsg);
                return Result.Fail<bool>(extractionResult.Errors);
            }

            var chapters = extractionResult.Value;
            if (chapters == null || chapters.Count == 0)
            {
                _logger.LogWarning("[ProcessEbook] EPUB has no readable content files: {EbookId}", request.EbookId);
                return Result.Fail<bool>("EPUB nie zawiera czytelnych rozdziałów.");
            }

            int totalChapters = chapters.Count;
            _logger.LogInformation("[ProcessEbook] Processing {Count} chapters for Ebook: {Title}", totalChapters, ebook.Title);

            await _broadcaster.BroadcastIngestionProgressAsync(request.UserId, $"Analizowanie struktury ({totalChapters} rozdziałów)...", 0.15, cancellationToken);

            int processedChapters = 0;
            int failedChunks = 0;

            for (int i = 0; i < totalChapters; i++)
            {
                var cleanText = chapters[i];

                // A null entry is treated like a too-short chapter instead of crashing the whole run.
                if (cleanText is null || cleanText.Length < MinChapterLength)
                {
                    _logger.LogInformation("[ProcessEbook] Skipping chapter {Index} (text too short: {Length} chars)", i, cleanText?.Length ?? 0);
                    processedChapters++;
                    continue;
                }

                // Chunk the text to maintain granular Knowledge Units
                var chunks = ChunkText(cleanText, MaxWordsPerChunk);
                _logger.LogInformation("[ProcessEbook] Chapter {Index} split into {ChunkCount} chunk(s)", i, chunks.Count);

                failedChunks += await VectorizeChunksAsync(chunks, i, request, cancellationToken);

                processedChapters++;

                // Chapter processing covers the 15%..90% band of the progress bar.
                double progress = 0.15 + (0.75 * processedChapters / totalChapters);
                await _broadcaster.BroadcastIngestionProgressAsync(
                    request.UserId,
                    $"Przetwarzanie rozdziału {processedChapters} z {totalChapters} przez AI...",
                    progress,
                    cancellationToken);
            }

            if (failedChunks > 0)
            {
                // Chunk failures stay non-fatal, but make partial indexing visible in the logs.
                _logger.LogWarning("[ProcessEbook] {FailedChunks} chunk(s) could not be vectorized for Ebook: {EbookId}", failedChunks, request.EbookId);
            }

            // Mark the ebook as ready
            ebook.IsReadyForReading = true;
            await _ebookRepository.SaveChangesAsync(cancellationToken);

            _logger.LogInformation("[ProcessEbook] Ingestion and vector indexing completed for: {Title}", ebook.Title);

            await _broadcaster.BroadcastIngestionProgressAsync(
                request.UserId,
                "Indeksowanie wektorowe e-booka przez Nexus AI zakończone pomyślnie!",
                1.0,
                cancellationToken);

            return Result.Ok(true);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Caller-requested cancellation is not an indexing error; let it propagate.
            _logger.LogWarning("[ProcessEbook] Processing cancelled for ebook {EbookId}", request.EbookId);
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "[ProcessEbook] Critical error during background EPUB vectorization of ebook {EbookId}", request.EbookId);

            try
            {
                // The request token may already be cancelled; use None so the failure notice is still attempted.
                await _broadcaster.BroadcastIngestionProgressAsync(
                    request.UserId,
                    $"Błąd indeksowania: {ex.Message}",
                    1.0,
                    CancellationToken.None);
            }
            catch (Exception broadcastEx)
            {
                // A broken broadcaster must not hide the original failure from the caller.
                _logger.LogWarning(broadcastEx, "[ProcessEbook] Could not broadcast failure for ebook {EbookId}", request.EbookId);
            }

            return Result.Fail<bool>(new Error("Wystąpił błąd podczas indeksowania e-booka przez AI").CausedBy(ex));
        }
    }

    /// <summary>
    /// Sends every chunk of one chapter to the knowledge service. Failures of individual
    /// chunks are logged and counted; they do not stop the remaining chunks.
    /// </summary>
    /// <param name="chunks">Chunks of the chapter, in order.</param>
    /// <param name="chapterIndex">Zero-based chapter index, used only for logging.</param>
    /// <param name="request">Command supplying the tenant and ebook identifiers.</param>
    /// <param name="cancellationToken">Token that aborts processing when cancelled.</param>
    /// <returns>The number of chunks that failed.</returns>
    private async Task<int> VectorizeChunksAsync(
        IReadOnlyList<string> chunks,
        int chapterIndex,
        ProcessEbookCommand request,
        CancellationToken cancellationToken)
    {
        int failed = 0;

        foreach (var chunk in chunks)
        {
            try
            {
                // Invoke GetKnowledgeMapAsync to extract, embed, and upsert knowledge units
                var result = await _knowledgeService.GetKnowledgeMapAsync(chunk, request.TenantId, request.EbookId, cancellationToken);
                if (result.IsFailed)
                {
                    failed++;
                    _logger.LogWarning("[ProcessEbook] Failed to generate knowledge map for a chunk of chapter {Index}: {Error}", chapterIndex, result.Errors.FirstOrDefault()?.Message);
                }
            }
            catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
            {
                // Best-effort: any failure other than caller-requested cancellation skips only this chunk.
                failed++;
                _logger.LogError(ex, "[ProcessEbook] Exception during AI vectorization of chapter {Index} chunk", chapterIndex);
            }
        }

        return failed;
    }

    /// <summary>
    /// Splits <paramref name="text"/> into consecutive chunks of at most
    /// <paramref name="maxWords"/> words. Words are runs of non-whitespace characters, so
    /// line breaks and tabs count as separators just like spaces. Each chunk is a slice of
    /// the original text: whitespace between words inside a chunk is preserved, whitespace
    /// around a chunk is dropped.
    /// </summary>
    /// <param name="text">Text to split.</param>
    /// <param name="maxWords">Maximum number of words per chunk; must be positive.</param>
    /// <returns>The chunks in order; an empty list when the text is null or blank.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxWords"/> is zero or negative.</exception>
    private static List<string> ChunkText(string text, int maxWords = 3000)
    {
        if (maxWords <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(maxWords), maxWords, "Chunk size must be a positive number of words.");
        }

        var chunks = new List<string>();
        if (string.IsNullOrWhiteSpace(text))
        {
            return chunks;
        }

        int chunkStart = 0;    // index of the first character of the current chunk
        int lastWordEnd = 0;   // exclusive end index of the most recent word
        int wordsInChunk = 0;
        int position = 0;

        while (position < text.Length)
        {
            if (char.IsWhiteSpace(text[position]))
            {
                position++;
                continue;
            }

            // Consume one whole word.
            int wordStart = position;
            while (position < text.Length && !char.IsWhiteSpace(text[position]))
            {
                position++;
            }

            if (wordsInChunk == 0)
            {
                chunkStart = wordStart;
            }

            wordsInChunk++;
            lastWordEnd = position;

            if (wordsInChunk == maxWords)
            {
                chunks.Add(text.Substring(chunkStart, lastWordEnd - chunkStart));
                wordsInChunk = 0;
            }
        }

        // Flush the trailing, partially filled chunk.
        if (wordsInChunk > 0)
        {
            chunks.Add(text.Substring(chunkStart, lastWordEnd - chunkStart));
        }

        return chunks;
    }
}
|