feat: implement semantic search, knowledge unit extraction, and visualization components

This commit is contained in:
2026-05-03 15:59:30 +02:00
parent 94ecc7a404
commit 1f187b5125
24 changed files with 844 additions and 21 deletions
@@ -1,21 +1,26 @@
using Microsoft.AspNetCore.Identity.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore;
using NexusReader.Domain.Entities;
using NexusReader.Application.Abstractions.Persistence;
namespace NexusReader.Infrastructure.Persistence;
public class AppDbContext : IdentityDbContext<NexusUser>
public class AppDbContext : IdentityDbContext<NexusUser>, IApplicationDbContext
{
public AppDbContext(DbContextOptions<AppDbContext> options) : base(options)
{
}
public DbSet<SemanticKnowledgeCache> SemanticKnowledgeCache => Set<SemanticKnowledgeCache>();
public DbSet<KnowledgeUnit> KnowledgeUnits => Set<KnowledgeUnit>();
public DbSet<KnowledgeUnitLink> KnowledgeUnitLinks => Set<KnowledgeUnitLink>();
public DbSet<Ebook> Ebooks => Set<Ebook>();
public DbSet<QuizResult> QuizResults => Set<QuizResult>();
protected override void OnModelCreating(ModelBuilder modelBuilder)
{
modelBuilder.HasPostgresExtension("pgvector");
modelBuilder.Entity<NexusUser>(entity =>
{
entity.Property(u => u.LastReadPageId).HasMaxLength(255);
@@ -28,6 +33,30 @@ public class AppDbContext : IdentityDbContext<NexusUser>
{
entity.HasKey(e => e.ContentHash);
entity.HasIndex(e => e.ContentHash).IsUnique();
entity.HasIndex(e => e.TenantId);
entity.Property(e => e.Vector).HasColumnType("vector(1536)"); // Standard for many models
});
modelBuilder.Entity<KnowledgeUnit>(entity =>
{
entity.HasKey(e => e.Id);
entity.HasIndex(e => e.TenantId);
entity.HasIndex(e => e.SourceId);
entity.Property(e => e.Vector).HasColumnType("vector(768)"); // text-embedding-004
});
modelBuilder.Entity<KnowledgeUnitLink>(entity =>
{
entity.HasKey(e => e.Id);
entity.HasOne(e => e.SourceUnit)
.WithMany(u => u.OutgoingLinks)
.HasForeignKey(e => e.SourceUnitId)
.OnDelete(DeleteBehavior.Cascade);
entity.HasOne(e => e.TargetUnit)
.WithMany(u => u.IncomingLinks)
.HasForeignKey(e => e.TargetUnitId)
.OnDelete(DeleteBehavior.Cascade);
});
modelBuilder.Entity<Ebook>(entity =>
@@ -12,12 +12,14 @@ using Polly;
using Polly.Registry;
using Microsoft.Extensions.Options;
using NexusReader.Infrastructure.Configuration;
using Pgvector.EntityFrameworkCore;
namespace NexusReader.Infrastructure.Services;
public class KnowledgeService : IKnowledgeService
{
private readonly IChatClient _chatClient;
private readonly IEmbeddingGenerator<string, Embedding<float>> _embeddingGenerator;
private readonly AppDbContext _dbContext;
private readonly ResiliencePipeline _retryPipeline;
private readonly AiSettings _settings;
@@ -25,12 +27,14 @@ public class KnowledgeService : IKnowledgeService
private const string PromptVersion = "1.0";
public KnowledgeService(
IChatClient chatClient,
IChatClient chatClient,
IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator,
AppDbContext dbContext,
ResiliencePipelineProvider<string> pipelineProvider,
IOptions<AiSettings> settings)
{
_chatClient = chatClient;
_embeddingGenerator = embeddingGenerator;
_dbContext = dbContext;
_retryPipeline = pipelineProvider.GetPipeline("ai-retry");
_settings = settings.Value;
@@ -54,6 +58,11 @@ public class KnowledgeService : IKnowledgeService
return await GetKnowledgeInternalAsync(text, PromptRegistry.SummaryAndQuizPrompt, "summary_quiz", cancellationToken);
}
public async Task<Result<KnowledgePacket>> GetKnowledgeMapAsync(string text, CancellationToken cancellationToken = default)
{
return await GetKnowledgeInternalAsync(text, PromptRegistry.KM_ExtractionPrompt, "km_map", cancellationToken);
}
private async Task<Result<KnowledgePacket>> GetKnowledgeInternalAsync(string text, string systemPrompt, string cacheSuffix, CancellationToken cancellationToken)
{
if (string.IsNullOrWhiteSpace(text))
@@ -115,18 +124,47 @@ public class KnowledgeService : IKnowledgeService
var knowledgePacket = JsonSerializer.Deserialize<KnowledgePacket>(jsonResponse, new JsonSerializerOptions { PropertyNameCaseInsensitive = true });
if (knowledgePacket == null) return Result.Fail("Failed to deserialize AI response.");
// 3. Save to Cache
// 3. Generate Embedding if not present
float[]? vector = null;
try
{
var embeddingResponse = await _retryPipeline.ExecuteAsync(async ct =>
await _embeddingGenerator.GenerateAsync(new[] { normalizedText }, cancellationToken: ct), cancellationToken);
vector = embeddingResponse.First().Vector.ToArray();
}
catch (Exception ex)
{
Console.WriteLine($"[KnowledgeService] Embedding Error: {ex.Message}");
// We continue even if embedding fails, as the primary goal was knowledge extraction
}
// 4. Save to Cache
var cacheEntry = new SemanticKnowledgeCache
{
ContentHash = hash,
JsonData = jsonResponse,
OriginalText = normalizedText,
ModelId = _settings.Model,
PromptVersion = PromptVersion,
TenantId = "global", // Default for shared cache, should be overridden by caller context if possible
Vector = vector,
CreatedAt = DateTime.UtcNow
};
if (cached == null) _dbContext.SemanticKnowledgeCache.Add(cacheEntry);
else { cached.JsonData = jsonResponse; cached.CreatedAt = DateTime.UtcNow; }
else
{
cached.JsonData = jsonResponse;
cached.OriginalText = normalizedText;
cached.Vector = vector;
cached.CreatedAt = DateTime.UtcNow;
}
// 5. Process KM-RAG Units and Links if present
if (knowledgePacket.Units.Any())
{
await ProcessKnowledgeUnitsAsync(knowledgePacket, "global", cancellationToken);
}
await _dbContext.SaveChangesAsync(cancellationToken);
return Result.Ok(knowledgePacket);
@@ -143,6 +181,75 @@ public class KnowledgeService : IKnowledgeService
}
}
private async Task ProcessKnowledgeUnitsAsync(KnowledgePacket packet, string tenantId, CancellationToken cancellationToken)
{
foreach (var unitDto in packet.Units)
{
var unitId = unitDto.Id;
var existing = await _dbContext.KnowledgeUnits.FindAsync(new object[] { unitId }, cancellationToken);
var unit = existing ?? new KnowledgeUnit { Id = unitId, TenantId = tenantId };
unit.Type = Enum.TryParse<NexusReader.Domain.Enums.KnowledgeUnitType>(unitDto.Type, true, out var type) ? type : NexusReader.Domain.Enums.KnowledgeUnitType.Snippet;
unit.Content = unitDto.Content;
unit.SourceId = "extracted"; // Should be passed from context
unit.MetadataJson = JsonSerializer.Serialize(unitDto.Metadata);
// Generate unit-specific embedding for granular retrieval
try
{
var emb = await _embeddingGenerator.GenerateAsync(new[] { unit.Content }, cancellationToken: cancellationToken);
unit.Vector = emb.First().Vector.ToArray();
}
catch { /* Ignore embedding errors for now */ }
if (existing == null) _dbContext.KnowledgeUnits.Add(unit);
}
foreach (var linkDto in packet.Links)
{
var link = new KnowledgeUnitLink
{
SourceUnitId = linkDto.Source,
TargetUnitId = linkDto.Target,
RelationType = linkDto.Relation
};
_dbContext.KnowledgeUnitLinks.Add(link);
}
}
public async Task<Result<List<RelevantContext>>> GetRelevantContextAsync(string query, string tenantId, CancellationToken cancellationToken = default)
{
if (string.IsNullOrWhiteSpace(query)) return Result.Fail("Query is empty.");
try
{
// 1. Generate embedding for query
var embeddingResponse = await _retryPipeline.ExecuteAsync(async ct =>
await _embeddingGenerator.GenerateAsync(new[] { query }, cancellationToken: ct), cancellationToken);
var queryVector = embeddingResponse.First().Vector.ToArray();
// 2. Search using pgvector
var results = await _dbContext.SemanticKnowledgeCache
.AsNoTracking()
.Where(x => (x.TenantId == tenantId || x.TenantId == "global") && x.Vector != null)
.OrderBy(x => x.Vector!.CosineDistance(queryVector))
.Take(5)
.Select(x => new RelevantContext
{
Text = x.OriginalText,
SourceId = x.ContentHash,
Confidence = 1 - x.Vector!.CosineDistance(queryVector)
})
.ToListAsync(cancellationToken);
return Result.Ok(results);
}
catch (Exception ex)
{
return Result.Fail(new Error("Failed to retrieve relevant context").CausedBy(ex));
}
}
public async Task<Result> ClearCacheAsync(CancellationToken cancellationToken = default)
{
try
@@ -21,4 +21,13 @@ public static class PromptRegistry
public const string SummaryAndQuizPrompt =
"You are an expert educator. Provide a concise summary of the text and generate a challenging quiz (3-5 questions). " +
"Return ONLY minified JSON. Schema: { \"summary\": \"string\", \"quizzes\": [ { \"question\": \"string\", \"options\": [ \"string\" ], \"correct_index\": 0 } ] }";
public const string KM_ExtractionPrompt =
"You are an expert at Knowledge Engineering. Segment the provided text into discrete Knowledge Units. " +
"Identify 'units' (sections, tables, definitions, rules) and 'links' (how they relate). " +
"CRITICAL: Units must be granular. " +
"Schema: { " +
"\"units\": [ { \"id\": \"string\", \"type\": \"Section|Table|Definition|Rule\", \"content\": \"string\", \"metadata\": { \"page\": 0 } } ], " +
"\"links\": [ { \"source\": \"string\", \"target\": \"string\", \"relation\": \"Next|Defines|Contains|References\" } ] " +
"}.";
}