chore: checkpoint ai search comments and i18n foundation

This commit is contained in:
2026-03-28 17:17:31 +08:00
parent d18a709987
commit ec96d91548
71 changed files with 9494 additions and 423 deletions

993
backend/src/services/ai.rs Normal file
View File

@@ -0,0 +1,993 @@
use chrono::{DateTime, Utc};
use fastembed::{
InitOptionsUserDefined, Pooling, TextEmbedding, TokenizerFiles, UserDefinedEmbeddingModel,
};
use loco_rs::prelude::*;
use reqwest::Client;
use sea_orm::{
ActiveModelTrait, ConnectionTrait, DbBackend, EntityTrait, FromQueryResult, IntoActiveModel,
PaginatorTrait, QueryOrder, Set, Statement,
};
use serde::Serialize;
use serde_json::{json, Value};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::{Mutex, OnceLock};
use crate::{
models::_entities::{ai_chunks, site_settings},
services::content,
};
// Fallbacks applied when the corresponding `site_settings` fields are unset.
const DEFAULT_AI_PROVIDER: &str = "newapi";
const DEFAULT_AI_API_BASE: &str = "http://localhost:8317/v1";
// NOTE(review): placeholder credential shipped as a default — confirm it is
// never sent to a real provider in production.
const DEFAULT_AI_API_KEY: &str = "your-api-key-1";
const DEFAULT_CHAT_MODEL: &str = "gpt-5.4";
/// Reasoning effort sent on Responses-API payloads (see `build_provider_payload`).
const DEFAULT_REASONING_EFFORT: &str = "medium";
/// When true, Responses-API calls are sent with `"store": false`.
const DEFAULT_DISABLE_RESPONSE_STORAGE: bool = true;
/// Default number of chunks retrieved per question (settings clamp to 1..=12).
const DEFAULT_TOP_K: usize = 4;
/// Default max characters per indexed chunk (settings clamp to 400..=4000).
const DEFAULT_CHUNK_SIZE: usize = 1200;
/// Default system prompt (Chinese): answer strictly from the supplied blog
/// context and say so explicitly when the context is insufficient.
const DEFAULT_SYSTEM_PROMPT: &str =
    "你是这个博客的站内 AI 助手。请严格基于提供的博客上下文回答,优先给出准确结论,再补充细节;如果上下文不足,请明确说明。";
/// Batch size handed to fastembed when embedding chunk texts.
const EMBEDDING_BATCH_SIZE: usize = 32;
/// Expected embedding length; enforced by `vector_literal` before any SQL.
const EMBEDDING_DIMENSION: usize = 384;
/// Human-readable label for the local embedding backend.
const LOCAL_EMBEDDING_MODEL_LABEL: &str = "fastembed / local all-MiniLM-L6-v2";
/// On-disk cache directory for the downloaded model artifacts.
const LOCAL_EMBEDDING_CACHE_DIR: &str = "storage/ai_embedding_models/all-minilm-l6-v2";
const LOCAL_EMBEDDING_BASE_URL: &str =
    "https://huggingface.co/Qdrant/all-MiniLM-L6-v2-onnx/resolve/main";
/// Files required to run the model fully offline after the first download.
const LOCAL_EMBEDDING_FILES: [&str; 5] = [
    "model.onnx",
    "tokenizer.json",
    "config.json",
    "special_tokens_map.json",
    "tokenizer_config.json",
];
/// Process-wide lazily-initialized embedding model (see `local_embedding_engine`).
static TEXT_EMBEDDING_MODEL: OnceLock<Mutex<TextEmbedding>> = OnceLock::new();
/// Effective AI configuration resolved from the first `site_settings` row,
/// with defaults applied and numeric fields clamped to safe ranges.
#[derive(Clone, Debug)]
struct AiRuntimeSettings {
    /// The raw settings row, kept so callers can update it later
    /// (e.g. stamping `ai_last_indexed_at`).
    raw: site_settings::Model,
    /// Provider identifier (trimmed); defaults to `DEFAULT_AI_PROVIDER`.
    provider: String,
    /// Chat API base URL; `None` when unset/blank.
    api_base: Option<String>,
    /// Chat API key; `None` when unset/blank.
    api_key: Option<String>,
    /// Chat model name; defaults to `DEFAULT_CHAT_MODEL`.
    chat_model: String,
    /// System prompt injected into every chat request.
    system_prompt: String,
    /// Chunks retrieved per query, clamped to 1..=12.
    top_k: usize,
    /// Max chunk size in characters when indexing, clamped to 400..=4000.
    chunk_size: usize,
}
/// A not-yet-persisted chunk of post content, produced by `build_chunks`
/// and inserted into `ai_chunks` during `rebuild_index`.
#[derive(Clone, Debug)]
struct ChunkDraft {
    // Slug of the post the chunk came from.
    source_slug: String,
    // Post title, when present.
    source_title: Option<String>,
    // Markdown file path of the source post.
    source_path: Option<String>,
    // Currently always "post".
    source_type: String,
    // 0-based position of this chunk within its post.
    chunk_index: i32,
    // Full chunk text that gets embedded.
    content: String,
    // First ~180 chars of the flattened content, for display.
    content_preview: Option<String>,
    // Whitespace-separated token count of `content`.
    word_count: Option<i32>,
}
/// An `ai_chunks` row paired with its similarity score for one query.
#[derive(Clone, Debug)]
struct ScoredChunk {
    // Cosine similarity in [0, 1]-ish (1 - pgvector cosine distance).
    score: f64,
    row: ai_chunks::Model,
}
/// Raw projection of the vector-similarity SQL in `retrieve_matches`;
/// column names must match the SELECT list exactly.
#[derive(Clone, Debug, FromQueryResult)]
struct SimilarChunkRow {
    source_slug: String,
    source_title: Option<String>,
    chunk_index: i32,
    content: String,
    content_preview: Option<String>,
    word_count: Option<i32>,
    // Computed column: (1 - cosine distance) cast to float8.
    score: f64,
}
/// Whether a text is an indexed passage or a user query. Currently both are
/// prepared identically (see `prepare_embedding_text`), but the distinction
/// allows prefixing schemes later without touching call sites.
#[derive(Clone, Copy, Debug)]
enum EmbeddingKind {
    Passage,
    Query,
}
/// A citation returned to the client alongside the answer.
#[derive(Clone, Debug, Serialize)]
pub struct AiSource {
    pub slug: String,
    pub title: String,
    pub excerpt: String,
    // Similarity rounded to 4 decimal places.
    pub score: f64,
    pub chunk_index: i32,
}
/// Final answer payload for the non-streaming `answer_question` path.
#[derive(Clone, Debug)]
pub struct AiAnswer {
    pub answer: String,
    pub sources: Vec<AiSource>,
    // Total number of rows currently in `ai_chunks`.
    pub indexed_chunks: usize,
    pub last_indexed_at: Option<DateTime<Utc>>,
}
/// Everything needed to issue one chat call to the configured provider
/// (used by both the blocking and streaming paths).
#[derive(Clone, Debug)]
pub(crate) struct AiProviderRequest {
    pub(crate) provider: String,
    pub(crate) api_base: String,
    pub(crate) api_key: String,
    pub(crate) chat_model: String,
    pub(crate) system_prompt: String,
    // Fully-rendered user prompt including retrieved context blocks.
    pub(crate) prompt: String,
}
/// Output of `prepare_answer`: either `immediate_answer` is set (no matches,
/// or no provider configured) or `provider_request` is — never both unset.
#[derive(Clone, Debug)]
pub(crate) struct PreparedAiAnswer {
    pub(crate) question: String,
    pub(crate) provider_request: Option<AiProviderRequest>,
    pub(crate) immediate_answer: Option<String>,
    pub(crate) sources: Vec<AiSource>,
    pub(crate) indexed_chunks: usize,
    pub(crate) last_indexed_at: Option<DateTime<Utc>>,
}
/// Result of a full index rebuild (see `rebuild_index`).
#[derive(Clone, Debug)]
pub struct AiIndexSummary {
    // Number of chunk drafts produced for this rebuild.
    pub indexed_chunks: usize,
    pub last_indexed_at: Option<DateTime<Utc>>,
}
/// Normalize an optional string: trim surrounding whitespace and collapse
/// empty or whitespace-only values to `None`.
fn trim_to_option(value: Option<String>) -> Option<String> {
    match value {
        Some(raw) => {
            let trimmed = raw.trim();
            if trimmed.is_empty() {
                None
            } else {
                Some(trimmed.to_string())
            }
        }
        None => None,
    }
}
/// Collapse all whitespace in `content` to single spaces and return the
/// first `limit` characters; `None` when nothing remains.
fn preview_text(content: &str, limit: usize) -> Option<String> {
    let mut flattened = String::new();
    for word in content.split_whitespace() {
        if !flattened.is_empty() {
            flattened.push(' ');
        }
        flattened.push_str(word);
    }
    if flattened.is_empty() {
        None
    } else {
        Some(flattened.chars().take(limit).collect())
    }
}
/// Join an API base and a path with exactly one `/` between them,
/// regardless of trailing/leading slashes on either side.
fn build_endpoint(api_base: &str, path: &str) -> String {
    let base = api_base.trim_end_matches('/');
    let tail = path.trim_start_matches('/');
    format!("{base}/{tail}")
}
fn local_embedding_dir() -> PathBuf {
PathBuf::from(LOCAL_EMBEDDING_CACHE_DIR)
}
/// Download one model artifact into `directory`, skipping files that already
/// exist from a previous run (resumable at file granularity).
///
/// Uses a *blocking* reqwest client; in this module it only runs on the
/// `spawn_blocking` path inside `embed_texts_locally`.
///
/// # Errors
/// `BadRequest` when the HTTP request, status check, or disk write fails.
fn download_embedding_file(
    client: &reqwest::blocking::Client,
    directory: &Path,
    file_name: &str,
) -> Result<()> {
    let target_path = directory.join(file_name);
    // Already cached: nothing to do.
    if target_path.exists() {
        return Ok(());
    }
    let url = format!("{LOCAL_EMBEDDING_BASE_URL}/{file_name}");
    let bytes = client
        .get(url)
        .send()
        // Treat non-2xx statuses as failures instead of writing an error page to disk.
        .and_then(reqwest::blocking::Response::error_for_status)
        .map_err(|error| Error::BadRequest(format!("下载本地 embedding 文件失败: {error}")))?
        .bytes()
        .map_err(|error| Error::BadRequest(format!("读取本地 embedding 文件失败: {error}")))?;
    fs::write(&target_path, &bytes)
        .map_err(|error| Error::BadRequest(format!("写入本地 embedding 文件失败: {error}")))?;
    Ok(())
}
/// Ensure every local embedding model artifact is present on disk,
/// downloading any missing ones; returns the cache directory.
///
/// # Errors
/// `BadRequest` when the directory cannot be created, the HTTP client cannot
/// be built, or any download fails.
fn ensure_local_embedding_files() -> Result<PathBuf> {
    let directory = local_embedding_dir();
    fs::create_dir_all(&directory)
        .map_err(|error| Error::BadRequest(format!("创建本地 embedding 目录失败: {error}")))?;
    let client = reqwest::blocking::Client::builder()
        .build()
        .map_err(|error| {
            Error::BadRequest(format!("创建本地 embedding 下载客户端失败: {error}"))
        })?;
    for file_name in LOCAL_EMBEDDING_FILES {
        download_embedding_file(&client, &directory, file_name)?;
    }
    Ok(directory)
}
/// Construct a fastembed `TextEmbedding` from the locally cached ONNX model
/// and tokenizer files (all-MiniLM-L6-v2 with mean pooling), downloading
/// missing artifacts first.
///
/// # Errors
/// `BadRequest` when any artifact cannot be read or the model fails to
/// initialize.
fn load_local_embedding_model() -> Result<TextEmbedding> {
    let directory = ensure_local_embedding_files()?;
    let tokenizer_files = TokenizerFiles {
        tokenizer_file: fs::read(directory.join("tokenizer.json"))
            .map_err(|error| Error::BadRequest(format!("读取 tokenizer.json 失败: {error}")))?,
        config_file: fs::read(directory.join("config.json"))
            .map_err(|error| Error::BadRequest(format!("读取 config.json 失败: {error}")))?,
        special_tokens_map_file: fs::read(directory.join("special_tokens_map.json")).map_err(
            |error| Error::BadRequest(format!("读取 special_tokens_map.json 失败: {error}")),
        )?,
        tokenizer_config_file: fs::read(directory.join("tokenizer_config.json")).map_err(
            |error| Error::BadRequest(format!("读取 tokenizer_config.json 失败: {error}")),
        )?,
    };
    // Mean pooling matches how all-MiniLM-L6-v2 sentence embeddings are
    // normally produced.
    let model = UserDefinedEmbeddingModel::new(
        fs::read(directory.join("model.onnx"))
            .map_err(|error| Error::BadRequest(format!("读取 model.onnx 失败: {error}")))?,
        tokenizer_files,
    )
    .with_pooling(Pooling::Mean);
    TextEmbedding::try_new_from_user_defined(model, InitOptionsUserDefined::default())
        .map_err(|error| Error::BadRequest(format!("本地 embedding 模型初始化失败: {error}")))
}
/// Lazily initialize and return the process-wide embedding model.
///
/// NOTE(review): if two threads race past the first `get`, both load the
/// model and the loser's copy is dropped (`set` failure is ignored) — wasted
/// work, but not incorrect. `OnceLock::get_or_try_init` would close the gap
/// once stable.
fn local_embedding_engine() -> Result<&'static Mutex<TextEmbedding>> {
    if let Some(model) = TEXT_EMBEDDING_MODEL.get() {
        return Ok(model);
    }
    let model = load_local_embedding_model()?;
    let _ = TEXT_EMBEDDING_MODEL.set(Mutex::new(model));
    TEXT_EMBEDDING_MODEL
        .get()
        .ok_or_else(|| Error::BadRequest("本地 embedding 模型未能成功缓存".to_string()))
}
/// Render an embedding as a pgvector literal (`[v1,v2,…]`) after validating
/// its length against `EMBEDDING_DIMENSION`.
///
/// # Errors
/// `BadRequest` when the embedding has the wrong number of components.
fn vector_literal(embedding: &[f64]) -> Result<String> {
    let actual = embedding.len();
    if actual != EMBEDDING_DIMENSION {
        return Err(Error::BadRequest(format!(
            "embedding 维度异常,期望 {EMBEDDING_DIMENSION},实际 {}",
            actual
        )));
    }
    let mut body = String::new();
    for (index, value) in embedding.iter().enumerate() {
        if index > 0 {
            body.push(',');
        }
        body.push_str(&value.to_string());
    }
    Ok(format!("[{body}]"))
}
/// Pre-process a text before embedding. Passages and queries currently get
/// identical treatment (plain trim); the exhaustive match keeps this spot
/// honest if a new `EmbeddingKind` variant is ever added.
fn prepare_embedding_text(kind: EmbeddingKind, text: &str) -> String {
    match kind {
        EmbeddingKind::Passage | EmbeddingKind::Query => {}
    }
    text.trim().to_owned()
}
/// Greedily pack lines of `text` into segments of at most `chunk_size`
/// characters (joined with `\n`). A single line longer than `chunk_size`
/// still becomes its own segment — lines are never cut mid-way.
fn split_long_text(text: &str, chunk_size: usize) -> Vec<String> {
    let mut segments: Vec<String> = Vec::new();
    let mut buffer = String::new();
    for line in text.lines() {
        let combined_len = if buffer.is_empty() {
            line.chars().count()
        } else {
            // +1 accounts for the joining '\n'.
            buffer.chars().count() + 1 + line.chars().count()
        };
        if combined_len > chunk_size && !buffer.is_empty() {
            // Adding this line would overflow: flush and restart.
            segments.push(buffer.trim().to_string());
            buffer = line.to_string();
        } else if buffer.is_empty() {
            buffer = line.to_string();
        } else {
            buffer.push('\n');
            buffer.push_str(line);
        }
    }
    if !buffer.trim().is_empty() {
        segments.push(buffer.trim().to_string());
    }
    segments
}
/// Split published markdown posts into embedding-sized chunks.
///
/// Per post: flatten "# title" + description + body into one text, split on
/// blank lines into paragraphs, then greedily pack paragraphs into chunks of
/// at most `chunk_size` characters. Paragraphs longer than `chunk_size`
/// flush the buffer and are split line-wise via `split_long_text`.
/// `chunk_index` restarts at 0 for every post.
fn build_chunks(posts: &[content::MarkdownPost], chunk_size: usize) -> Vec<ChunkDraft> {
    let mut chunks = Vec::new();
    // Only published posts are indexed.
    for post in posts.iter().filter(|post| post.published) {
        let mut sections = Vec::new();
        sections.push(format!("# {}", post.title));
        if let Some(description) = post
            .description
            .as_deref()
            .filter(|value| !value.trim().is_empty())
        {
            sections.push(description.trim().to_string());
        }
        sections.push(post.content.trim().to_string());
        let source_text = sections
            .into_iter()
            .filter(|item| !item.trim().is_empty())
            .collect::<Vec<_>>()
            .join("\n\n");
        let paragraphs = source_text
            .split("\n\n")
            .map(str::trim)
            .filter(|value| !value.is_empty())
            .collect::<Vec<_>>();
        let mut buffer = String::new();
        let mut chunk_index = 0_i32;
        for paragraph in paragraphs {
            if paragraph.chars().count() > chunk_size {
                // Oversized paragraph: flush whatever is buffered first…
                if !buffer.trim().is_empty() {
                    chunks.push(ChunkDraft {
                        source_slug: post.slug.clone(),
                        source_title: Some(post.title.clone()),
                        source_path: Some(post.file_path.clone()),
                        source_type: "post".to_string(),
                        chunk_index,
                        content: buffer.trim().to_string(),
                        content_preview: preview_text(&buffer, 180),
                        word_count: Some(buffer.split_whitespace().count() as i32),
                    });
                    chunk_index += 1;
                    buffer.clear();
                }
                // …then emit the paragraph as its own run of line-split chunks.
                for part in split_long_text(paragraph, chunk_size) {
                    if part.trim().is_empty() {
                        continue;
                    }
                    chunks.push(ChunkDraft {
                        source_slug: post.slug.clone(),
                        source_title: Some(post.title.clone()),
                        source_path: Some(post.file_path.clone()),
                        source_type: "post".to_string(),
                        chunk_index,
                        // preview/word_count are computed before `part` moves
                        // into `content` — field order matters here.
                        content_preview: preview_text(&part, 180),
                        word_count: Some(part.split_whitespace().count() as i32),
                        content: part,
                    });
                    chunk_index += 1;
                }
                continue;
            }
            // Normal paragraph: try to append it to the running buffer.
            let candidate = if buffer.is_empty() {
                paragraph.to_string()
            } else {
                format!("{buffer}\n\n{paragraph}")
            };
            if candidate.chars().count() > chunk_size && !buffer.trim().is_empty() {
                // Appending would overflow: flush the buffer, start fresh.
                chunks.push(ChunkDraft {
                    source_slug: post.slug.clone(),
                    source_title: Some(post.title.clone()),
                    source_path: Some(post.file_path.clone()),
                    source_type: "post".to_string(),
                    chunk_index,
                    content_preview: preview_text(&buffer, 180),
                    word_count: Some(buffer.split_whitespace().count() as i32),
                    content: buffer.trim().to_string(),
                });
                chunk_index += 1;
                buffer = paragraph.to_string();
            } else {
                buffer = candidate;
            }
        }
        // Flush the tail buffer for this post.
        if !buffer.trim().is_empty() {
            chunks.push(ChunkDraft {
                source_slug: post.slug.clone(),
                source_title: Some(post.title.clone()),
                source_path: Some(post.file_path.clone()),
                source_type: "post".to_string(),
                chunk_index,
                content_preview: preview_text(&buffer, 180),
                word_count: Some(buffer.split_whitespace().count() as i32),
                content: buffer.trim().to_string(),
            });
        }
    }
    chunks
}
/// POST a JSON payload to `url` with bearer auth and parse the JSON reply.
///
/// The body is read as text before parsing so that a non-2xx status can
/// surface the raw provider response in the error message.
///
/// # Errors
/// `BadRequest` on network failure, non-success status, or unparsable JSON.
async fn request_json(client: &Client, url: &str, api_key: &str, payload: Value) -> Result<Value> {
    let response = client
        .post(url)
        .bearer_auth(api_key)
        .header("Accept", "application/json")
        .json(&payload)
        .send()
        .await
        .map_err(|error| Error::BadRequest(format!("AI request failed: {error}")))?;
    let status = response.status();
    let body = response
        .text()
        .await
        .map_err(|error| Error::BadRequest(format!("AI response read failed: {error}")))?;
    if !status.is_success() {
        return Err(Error::BadRequest(format!(
            "AI provider returned {status}: {body}"
        )));
    }
    serde_json::from_str(&body)
        .map_err(|error| Error::BadRequest(format!("AI response parse failed: {error}")))
}
/// Whether this provider speaks the OpenAI Responses API (`/responses`)
/// rather than `/chat/completions`. Only "newapi" does, case-insensitively.
fn provider_uses_responses(provider: &str) -> bool {
    provider.to_ascii_lowercase() == "newapi"
}
/// Embed `inputs` with the local model on a blocking worker thread.
///
/// The fastembed call is CPU-bound (and may trigger model-file downloads on
/// first use), so the whole pipeline runs under `spawn_blocking`. The global
/// model is guarded by a `Mutex`, which serializes concurrent batches.
/// Returns one vector per input (f32 values upcast to f64).
///
/// # Errors
/// `BadRequest` when the model is unavailable, poisoned, or embedding fails;
/// also when the blocking task itself panics/aborts.
async fn embed_texts_locally(inputs: Vec<String>, kind: EmbeddingKind) -> Result<Vec<Vec<f64>>> {
    tokio::task::spawn_blocking(move || {
        let model = local_embedding_engine()?;
        let prepared = inputs
            .iter()
            .map(|item| prepare_embedding_text(kind, item))
            .collect::<Vec<_>>();
        // A poisoned lock (earlier panic while embedding) surfaces as BadRequest.
        let mut guard = model.lock().map_err(|_| {
            Error::BadRequest("本地 embedding 模型当前不可用,请稍后重试".to_string())
        })?;
        let embeddings = guard
            .embed(prepared, Some(EMBEDDING_BATCH_SIZE))
            .map_err(|error| Error::BadRequest(format!("本地 embedding 生成失败: {error}")))?;
        Ok(embeddings
            .into_iter()
            .map(|embedding| embedding.into_iter().map(f64::from).collect::<Vec<_>>())
            .collect::<Vec<_>>())
    })
    .await
    .map_err(|error| Error::BadRequest(format!("本地 embedding 任务执行失败: {error}")))?
}
/// Pull the assistant text out of a `/chat/completions` response:
/// `choices[0].message.content`, accepting either a plain string or an
/// array of `{ "text": … }` parts joined with newlines.
fn extract_message_content(value: &Value) -> Option<String> {
    let content = value
        .get("choices")?
        .as_array()?
        .first()?
        .get("message")?
        .get("content")?;
    if let Some(text) = content.as_str() {
        return Some(text.trim().to_string());
    }
    let parts = content.as_array()?;
    let merged = parts
        .iter()
        .filter_map(|part| part.get("text").and_then(Value::as_str))
        .collect::<Vec<_>>()
        .join("\n");
    if merged.trim().is_empty() {
        None
    } else {
        Some(merged.trim().to_string())
    }
}
/// Trim every segment, drop empty ones, and join the survivors with
/// newlines; `None` when nothing is left.
fn merge_text_segments(parts: Vec<String>) -> Option<String> {
    let mut kept = Vec::new();
    for part in parts {
        let trimmed = part.trim();
        if !trimmed.is_empty() {
            kept.push(trimmed.to_string());
        }
    }
    let merged = kept.join("\n");
    if merged.trim().is_empty() {
        None
    } else {
        Some(merged)
    }
}
/// Pull the answer text out of an OpenAI Responses-API payload.
///
/// Prefers the convenience field `output_text`; otherwise walks
/// `output[].content[]`, accepting either `{ "text": … }` or the nested
/// `{ "output_text": { "text": … } }` shape, and merges the segments.
fn extract_response_output(value: &Value) -> Option<String> {
    if let Some(direct) = value.get("output_text").and_then(Value::as_str) {
        let trimmed = direct.trim();
        if !trimmed.is_empty() {
            return Some(trimmed.to_string());
        }
    }
    let mut segments = Vec::new();
    for item in value.get("output").and_then(Value::as_array)? {
        let Some(contents) = item.get("content").and_then(Value::as_array) else {
            continue;
        };
        for entry in contents {
            let text = entry.get("text").and_then(Value::as_str).or_else(|| {
                entry
                    .get("output_text")
                    .and_then(|inner| inner.get("text"))
                    .and_then(Value::as_str)
            });
            if let Some(text) = text {
                segments.push(text.to_string());
            }
        }
    }
    merge_text_segments(segments)
}
/// Assemble the user-turn prompt: numbered context blocks (title, slug,
/// similarity, content) followed by strict answering rules (Chinese,
/// Markdown, no fabrication) and the user's question.
fn build_chat_prompt(question: &str, matches: &[ScoredChunk]) -> String {
    let context_blocks = matches
        .iter()
        .enumerate()
        .map(|(index, item)| {
            format!(
                "[资料 {}]\n标题: {}\nSlug: {}\n相似度: {:.4}\n内容:\n{}",
                index + 1,
                // Untitled rows fall back to a literal "unnamed" label.
                item.row
                    .source_title
                    .as_deref()
                    .filter(|value| !value.trim().is_empty())
                    .unwrap_or("未命名内容"),
                item.row.source_slug,
                item.score,
                item.row.content
            )
        })
        .collect::<Vec<_>>()
        .join("\n\n");
    // Trailing `\` continuations keep the literal free of the source indent.
    format!(
        "请仅根据下面提供的资料回答用户问题。\n\
         如果资料不足以支撑结论,请直接说明“我在当前博客资料里没有找到足够信息”。\n\
         回答要求:\n\
         1. 使用中文。\n\
         2. 使用 Markdown 输出,必要时用短列表或小标题,不要输出 HTML。\n\
         3. 先给直接结论,再补充关键点,整体尽量精炼。\n\
         4. 不要编造未在资料中出现的事实。\n\
         5. 如果回答引用了具体资料,可自然地提及文章标题。\n\n\
         用户问题:{question}\n\n\
         可用资料:\n{context_blocks}"
    )
}
/// Convert scored retrieval rows into client-facing citations, rounding
/// the similarity to four decimal places and falling back to an on-the-fly
/// preview when a row has no stored `content_preview`.
fn build_sources(matches: &[ScoredChunk]) -> Vec<AiSource> {
    let mut sources = Vec::with_capacity(matches.len());
    for item in matches {
        let title = item
            .row
            .source_title
            .as_deref()
            .filter(|value| !value.trim().is_empty())
            .unwrap_or("未命名内容")
            .to_string();
        let excerpt = match item.row.content_preview.clone() {
            Some(preview) => preview,
            None => preview_text(&item.row.content, 180).unwrap_or_default(),
        };
        sources.push(AiSource {
            slug: item.row.source_slug.clone(),
            title,
            excerpt,
            score: (item.score * 10000.0).round() / 10000.0,
            chunk_index: item.row.chunk_index,
        });
    }
    sources
}
/// Build the JSON request body for the chat call.
///
/// Responses-API providers ("newapi") get a `/responses` payload with
/// reasoning effort, a 520 output-token cap, and storage opt-out; everything
/// else gets a classic `/chat/completions` payload at temperature 0.2.
pub(crate) fn build_provider_payload(request: &AiProviderRequest, stream: bool) -> Value {
    if provider_uses_responses(&request.provider) {
        json!({
            "model": request.chat_model,
            "input": [
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "input_text",
                            "text": request.system_prompt
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": request.prompt
                        }
                    ]
                }
            ],
            "reasoning": {
                "effort": DEFAULT_REASONING_EFFORT
            },
            "max_output_tokens": 520,
            // store=false when storage is disabled (the default here).
            "store": !DEFAULT_DISABLE_RESPONSE_STORAGE,
            "stream": stream
        })
    } else {
        json!({
            "model": request.chat_model,
            // Low temperature keeps grounded answers close to the context.
            "temperature": 0.2,
            "stream": stream,
            "messages": [
                {
                    "role": "system",
                    "content": request.system_prompt,
                },
                {
                    "role": "user",
                    "content": request.prompt,
                }
            ]
        })
    }
}
/// Full endpoint URL for this provider: `/responses` for Responses-API
/// providers, `/chat/completions` for everyone else.
pub(crate) fn build_provider_url(request: &AiProviderRequest) -> String {
    if provider_uses_responses(&request.provider) {
        build_endpoint(&request.api_base, "/responses")
    } else {
        build_endpoint(&request.api_base, "/chat/completions")
    }
}
/// Extract answer text from either response shape: try the Responses-API
/// format first, then fall back to `/chat/completions`.
pub(crate) fn extract_provider_text(value: &Value) -> Option<String> {
    if let Some(text) = extract_response_output(value) {
        return Some(text);
    }
    extract_message_content(value)
}
/// Send one blocking (non-streaming) chat request and extract the answer
/// text from the provider's JSON response.
///
/// # Errors
/// `BadRequest` when the HTTP call fails or the response carries no
/// readable content.
async fn request_chat_answer(request: &AiProviderRequest) -> Result<String> {
    let client = Client::new();
    let response = request_json(
        &client,
        &build_provider_url(request),
        &request.api_key,
        build_provider_payload(request, false),
    )
    .await?;
    extract_provider_text(&response).ok_or_else(|| {
        Error::BadRequest("AI chat response did not contain readable content".to_string())
    })
}
/// Validate the question, run retrieval, and assemble everything needed to
/// answer it — without calling the chat provider yet.
///
/// Returns either an `immediate_answer` (no matches, or provider not
/// configured) or a ready-to-send `AiProviderRequest` for the caller.
///
/// # Errors
/// `BadRequest` for an empty question; `NotFound` when the AI feature is
/// disabled; propagates settings/retrieval errors.
pub(crate) async fn prepare_answer(ctx: &AppContext, question: &str) -> Result<PreparedAiAnswer> {
    let trimmed_question = question.trim();
    if trimmed_question.is_empty() {
        return Err(Error::BadRequest("问题不能为空".to_string()));
    }
    // `true` => the AI toggle must be on, otherwise NotFound.
    let settings = load_runtime_settings(ctx, true).await?;
    let (matches, indexed_chunks, last_indexed_at) =
        retrieve_matches(ctx, &settings, trimmed_question).await?;
    if matches.is_empty() {
        // Nothing relevant indexed: short-circuit with a canned reply.
        return Ok(PreparedAiAnswer {
            question: trimmed_question.to_string(),
            provider_request: None,
            immediate_answer: Some(
                "我在当前博客资料里没有找到足够信息。你可以换个更具体的问题,或者先去后台重建一下 AI 索引。"
                    .to_string(),
            ),
            sources: Vec::new(),
            indexed_chunks,
            last_indexed_at,
        });
    }
    let sources = build_sources(&matches);
    // A provider request needs both an API base and a key.
    let provider_request = match (settings.api_base.clone(), settings.api_key.clone()) {
        (Some(api_base), Some(api_key)) => Some(AiProviderRequest {
            provider: settings.provider.clone(),
            api_base,
            api_key,
            chat_model: settings.chat_model.clone(),
            system_prompt: settings.system_prompt.clone(),
            prompt: build_chat_prompt(trimmed_question, &matches),
        }),
        _ => None,
    };
    // No provider configured: fall back to a retrieval-only summary.
    let immediate_answer = provider_request
        .is_none()
        .then(|| retrieval_only_answer(&matches));
    Ok(PreparedAiAnswer {
        question: trimmed_question.to_string(),
        provider_request,
        immediate_answer,
        sources,
        indexed_chunks,
        last_indexed_at,
    })
}
/// Fallback answer used when no chat provider is configured: summarize the
/// top three retrieval hits so the user still gets something useful.
///
/// Fix: the per-item line rendered `{title}》` with no opening `《` bracket
/// (mismatched CJK book-title quotes), and the status sentence ran
/// "API所以…摘要" without punctuation — both user-facing strings corrected.
fn retrieval_only_answer(matches: &[ScoredChunk]) -> String {
    let summary = matches
        .iter()
        .take(3)
        .map(|item| {
            let title = item
                .row
                .source_title
                .as_deref()
                .filter(|value| !value.trim().is_empty())
                .unwrap_or("未命名内容");
            let excerpt = item
                .row
                .content_preview
                .clone()
                .unwrap_or_else(|| preview_text(&item.row.content, 120).unwrap_or_default());
            // Title wrapped in matching CJK book-title brackets 《…》.
            format!("《{title}》: {excerpt}")
        })
        .collect::<Vec<_>>()
        .join("\n");
    format!(
        "本地知识检索已经完成,但后台还没有配置聊天模型 API,所以我先返回最相关的资料摘要:\n{summary}\n\n\
         如果你希望得到完整的自然语言回答,请在后台补上聊天模型的 API Base / API Key。"
    )
}
/// Load the first `site_settings` row and resolve it into runtime AI
/// settings, applying defaults and clamping `top_k` / `chunk_size`.
///
/// # Errors
/// `NotFound` when no settings row exists, or when `require_enabled` is set
/// and the `ai_enabled` toggle is off.
async fn load_runtime_settings(
    ctx: &AppContext,
    require_enabled: bool,
) -> Result<AiRuntimeSettings> {
    let raw = site_settings::Entity::find()
        .order_by_asc(site_settings::Column::Id)
        .one(&ctx.db)
        .await?
        .ok_or(Error::NotFound)?;
    if require_enabled && !raw.ai_enabled.unwrap_or(false) {
        return Err(Error::NotFound);
    }
    Ok(AiRuntimeSettings {
        provider: provider_name(raw.ai_provider.as_deref()),
        api_base: trim_to_option(raw.ai_api_base.clone()),
        api_key: trim_to_option(raw.ai_api_key.clone()),
        chat_model: trim_to_option(raw.ai_chat_model.clone())
            .unwrap_or_else(|| DEFAULT_CHAT_MODEL.to_string()),
        system_prompt: trim_to_option(raw.ai_system_prompt.clone())
            .unwrap_or_else(|| DEFAULT_SYSTEM_PROMPT.to_string()),
        // Out-of-range stored values are clamped, not rejected.
        top_k: raw
            .ai_top_k
            .map(|value| value.clamp(1, 12) as usize)
            .unwrap_or(DEFAULT_TOP_K),
        chunk_size: raw
            .ai_chunk_size
            .map(|value| value.clamp(400, 4000) as usize)
            .unwrap_or(DEFAULT_CHUNK_SIZE),
        raw,
    })
}
/// Stamp `ai_last_indexed_at` on the settings row with the current time and
/// return that timestamp.
async fn update_indexed_at(
    ctx: &AppContext,
    settings: &site_settings::Model,
) -> Result<DateTime<Utc>> {
    let stamped_at = Utc::now();
    let mut active = settings.clone().into_active_model();
    active.ai_last_indexed_at = Set(Some(stamped_at.into()));
    // The updated row itself is not needed; errors still propagate.
    let _ = active.update(&ctx.db).await?;
    Ok(stamped_at)
}
/// Embed the question and fetch the `top_k` nearest chunks via pgvector.
///
/// Lazily rebuilds the whole index when `ai_chunks` is empty. Returns
/// `(matches, total_indexed_chunks, last_indexed_at)`.
async fn retrieve_matches(
    ctx: &AppContext,
    settings: &AiRuntimeSettings,
    question: &str,
) -> Result<(Vec<ScoredChunk>, usize, Option<DateTime<Utc>>)> {
    let mut indexed_chunks = ai_chunks::Entity::find().count(&ctx.db).await? as usize;
    let mut last_indexed_at = settings.raw.ai_last_indexed_at.map(Into::into);
    // First-run convenience: build the index on demand.
    if indexed_chunks == 0 {
        let summary = rebuild_index(ctx).await?;
        indexed_chunks = summary.indexed_chunks;
        last_indexed_at = summary.last_indexed_at;
    }
    if indexed_chunks == 0 {
        return Ok((Vec::new(), 0, last_indexed_at));
    }
    let question_embedding =
        embed_texts_locally(vec![question.trim().to_string()], EmbeddingKind::Query)
            .await?
            .into_iter()
            .next()
            .unwrap_or_default();
    // An empty embedding (model produced nothing) fails the dimension check
    // here instead of querying with junk.
    let query_vector = vector_literal(&question_embedding)?;
    // `<=>` is pgvector's cosine-distance operator; score = 1 - distance.
    let statement = Statement::from_sql_and_values(
        DbBackend::Postgres,
        r#"
        SELECT
            source_slug,
            source_title,
            chunk_index,
            content,
            content_preview,
            word_count,
            (1 - (embedding <=> $1::vector))::float8 AS score
        FROM ai_chunks
        WHERE embedding IS NOT NULL
        ORDER BY embedding <=> $1::vector
        LIMIT $2
        "#,
        [query_vector.into(), (settings.top_k as i64).into()],
    );
    let matches = SimilarChunkRow::find_by_statement(statement)
        .all(&ctx.db)
        .await?
        .into_iter()
        // Re-shape the projection into a full `ai_chunks::Model`; columns not
        // selected by the query (id, paths, embedding) get placeholder values.
        .map(|row| ScoredChunk {
            score: row.score,
            row: ai_chunks::Model {
                created_at: Utc::now().into(),
                updated_at: Utc::now().into(),
                id: 0,
                source_slug: row.source_slug,
                source_title: row.source_title,
                source_path: None,
                source_type: "post".to_string(),
                chunk_index: row.chunk_index,
                content: row.content,
                content_preview: row.content_preview,
                embedding: None,
                word_count: row.word_count,
            },
        })
        .collect::<Vec<_>>();
    Ok((matches, indexed_chunks, last_indexed_at))
}
/// Rebuild the AI search index from scratch: sync markdown posts, chunk
/// them, embed every chunk locally, truncate `ai_chunks`, and re-insert.
///
/// Runs even when the AI feature toggle is off (`require_enabled = false`)
/// so admins can pre-build the index.
///
/// NOTE(review): the table is truncated before inserts, so a mid-way failure
/// leaves a partial index until the next rebuild; also `zip` silently drops
/// drafts if the embedding batch returns fewer vectors than inputs while the
/// summary still reports `chunk_drafts.len()` — confirm fastembed always
/// returns one vector per input.
pub async fn rebuild_index(ctx: &AppContext) -> Result<AiIndexSummary> {
    let settings = load_runtime_settings(ctx, false).await?;
    let posts = content::sync_markdown_posts(ctx).await?;
    let chunk_drafts = build_chunks(&posts, settings.chunk_size);
    let embeddings = if chunk_drafts.is_empty() {
        Vec::new()
    } else {
        embed_texts_locally(
            chunk_drafts
                .iter()
                .map(|chunk| chunk.content.clone())
                .collect::<Vec<_>>(),
            EmbeddingKind::Passage,
        )
        .await?
    };
    // Wipe the old index (and reset ids) before inserting fresh rows.
    ctx.db
        .execute(Statement::from_string(
            DbBackend::Postgres,
            "TRUNCATE TABLE ai_chunks RESTART IDENTITY".to_string(),
        ))
        .await?;
    for (draft, embedding) in chunk_drafts.iter().zip(embeddings.into_iter()) {
        let embedding_literal = vector_literal(&embedding)?;
        let statement = Statement::from_sql_and_values(
            DbBackend::Postgres,
            r#"
            INSERT INTO ai_chunks (
                source_slug,
                source_title,
                source_path,
                source_type,
                chunk_index,
                content,
                content_preview,
                embedding,
                word_count
            ) VALUES (
                $1, $2, $3, $4, $5, $6, $7, $8::vector, $9
            )
            "#,
            vec![
                draft.source_slug.clone().into(),
                draft.source_title.clone().into(),
                draft.source_path.clone().into(),
                draft.source_type.clone().into(),
                draft.chunk_index.into(),
                draft.content.clone().into(),
                draft.content_preview.clone().into(),
                embedding_literal.into(),
                draft.word_count.into(),
            ],
        );
        ctx.db.execute(statement).await?;
    }
    let last_indexed_at = update_indexed_at(ctx, &settings.raw).await?;
    Ok(AiIndexSummary {
        indexed_chunks: chunk_drafts.len(),
        last_indexed_at: Some(last_indexed_at),
    })
}
/// Blocking question-answering entry point: prepare retrieval + prompt, then
/// either return the immediate (fallback) answer or call the chat provider.
///
/// # Errors
/// Propagates preparation errors; `BadRequest` when neither an immediate
/// answer nor a provider request was prepared (internal invariant).
pub async fn answer_question(ctx: &AppContext, question: &str) -> Result<AiAnswer> {
    let prepared = prepare_answer(ctx, question).await?;
    let answer = match prepared.immediate_answer.clone() {
        Some(ready) => ready,
        None => {
            let request = prepared.provider_request.as_ref().ok_or_else(|| {
                Error::BadRequest("AI provider request was not prepared".to_string())
            })?;
            request_chat_answer(request).await?
        }
    };
    Ok(AiAnswer {
        answer,
        sources: prepared.sources,
        indexed_chunks: prepared.indexed_chunks,
        last_indexed_at: prepared.last_indexed_at,
    })
}
/// Resolve the provider identifier: trimmed stored value, or the built-in
/// default ("newapi") when unset/blank.
pub fn provider_name(value: Option<&str>) -> String {
    match trim_to_option(value.map(str::to_string)) {
        Some(name) => name,
        None => DEFAULT_AI_PROVIDER.to_string(),
    }
}
/// Built-in default for the chat API base URL.
pub fn default_api_base() -> &'static str {
    DEFAULT_AI_API_BASE
}
/// Built-in default API key (placeholder value — see the constant's note).
pub fn default_api_key() -> &'static str {
    DEFAULT_AI_API_KEY
}
/// Built-in default chat model name.
pub fn default_chat_model() -> &'static str {
    DEFAULT_CHAT_MODEL
}
/// Display label for the local embedding backend.
pub fn local_embedding_label() -> &'static str {
    LOCAL_EMBEDDING_MODEL_LABEL
}

View File

@@ -1,13 +1,14 @@
use loco_rs::prelude::*;
use sea_orm::{
ActiveModelTrait, ColumnTrait, EntityTrait, IntoActiveModel, QueryFilter, QueryOrder, Set,
ActiveModelTrait, ColumnTrait, Condition, EntityTrait, IntoActiveModel, QueryFilter,
QueryOrder, Set,
};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::fs;
use std::path::{Path, PathBuf};
use crate::models::_entities::{categories, posts, tags};
use crate::models::_entities::{categories, comments, posts, tags};
pub const MARKDOWN_POSTS_DIR: &str = "content/posts";
const FIXTURE_POSTS_FILE: &str = "src/fixtures/posts.yaml";
@@ -120,6 +121,19 @@ fn slugify(value: &str) -> String {
slug.trim_matches('-').to_string()
}
/// Case- and whitespace-insensitive key used to compare user-facing names
/// (tags/categories).
fn normalized_match_key(value: &str) -> String {
    value.trim().to_lowercase()
}
/// True when both strings normalize to the same match key.
fn same_text(left: &str, right: &str) -> bool {
    let lhs = normalized_match_key(left);
    let rhs = normalized_match_key(right);
    lhs == rhs
}
/// True when `value` normalizes to a non-empty key equal to any of the
/// (already-normalized) `keys`.
fn text_matches_any(value: &str, keys: &[String]) -> bool {
    let current = normalized_match_key(value);
    if current.is_empty() {
        return false;
    }
    keys.iter().any(|key| key == &current)
}
fn excerpt_from_content(content: &str) -> Option<String> {
let mut in_code_block = false;
@@ -135,7 +149,11 @@ fn excerpt_from_content(content: &str) -> Option<String> {
}
let excerpt = trimmed.chars().take(180).collect::<String>();
return if excerpt.is_empty() { None } else { Some(excerpt) };
return if excerpt.is_empty() {
None
} else {
Some(excerpt)
};
}
None
@@ -188,7 +206,8 @@ fn parse_markdown_source(file_stem: &str, raw: &str, file_path: &str) -> Result<
let title = trim_to_option(frontmatter.title.clone())
.or_else(|| title_from_content(&content))
.unwrap_or_else(|| slug.clone());
let description = trim_to_option(frontmatter.description.clone()).or_else(|| excerpt_from_content(&content));
let description =
trim_to_option(frontmatter.description.clone()).or_else(|| excerpt_from_content(&content));
let category = trim_to_option(frontmatter.category.clone());
let tags = frontmatter
.tags
@@ -205,7 +224,8 @@ fn parse_markdown_source(file_stem: &str, raw: &str, file_path: &str) -> Result<
content: content.trim_start_matches('\n').to_string(),
category,
tags,
post_type: trim_to_option(frontmatter.post_type.clone()).unwrap_or_else(|| "article".to_string()),
post_type: trim_to_option(frontmatter.post_type.clone())
.unwrap_or_else(|| "article".to_string()),
image: trim_to_option(frontmatter.image.clone()),
pinned: frontmatter.pinned.unwrap_or(false),
published: frontmatter.published.unwrap_or(true),
@@ -216,7 +236,12 @@ fn parse_markdown_source(file_stem: &str, raw: &str, file_path: &str) -> Result<
fn build_markdown_document(post: &MarkdownPost) -> String {
let mut lines = vec![
"---".to_string(),
format!("title: {}", serde_yaml::to_string(&post.title).unwrap_or_else(|_| format!("{:?}", post.title)).trim()),
format!(
"title: {}",
serde_yaml::to_string(&post.title)
.unwrap_or_else(|_| format!("{:?}", post.title))
.trim()
),
format!("slug: {}", post.slug),
];
@@ -284,10 +309,16 @@ fn ensure_markdown_posts_bootstrapped() -> Result<()> {
image: None,
pinned: fixture.pinned.unwrap_or(false),
published: fixture.published.unwrap_or(true),
file_path: markdown_post_path(&fixture.slug).to_string_lossy().to_string(),
file_path: markdown_post_path(&fixture.slug)
.to_string_lossy()
.to_string(),
};
fs::write(markdown_post_path(&fixture.slug), build_markdown_document(&post)).map_err(io_error)?;
fs::write(
markdown_post_path(&fixture.slug),
build_markdown_document(&post),
)
.map_err(io_error)?;
}
Ok(())
@@ -312,14 +343,19 @@ async fn sync_tags_from_posts(ctx: &AppContext, posts: &[MarkdownPost]) -> Resul
for post in posts {
for tag_name in &post.tags {
let slug = slugify(tag_name);
let trimmed = tag_name.trim();
let existing = tags::Entity::find()
.filter(tags::Column::Slug.eq(&slug))
.filter(
Condition::any()
.add(tags::Column::Slug.eq(&slug))
.add(tags::Column::Name.eq(trimmed)),
)
.one(&ctx.db)
.await?;
if existing.is_none() {
let item = tags::ActiveModel {
name: Set(Some(tag_name.clone())),
name: Set(Some(trimmed.to_string())),
slug: Set(slug),
..Default::default()
};
@@ -339,12 +375,21 @@ async fn ensure_category(ctx: &AppContext, raw_name: &str) -> Result<Option<Stri
let slug = slugify(name);
let existing = categories::Entity::find()
.filter(categories::Column::Slug.eq(&slug))
.filter(
Condition::any()
.add(categories::Column::Slug.eq(&slug))
.add(categories::Column::Name.eq(name)),
)
.one(&ctx.db)
.await?;
if let Some(category) = existing {
if let Some(existing_name) = category.name.as_deref().map(str::trim).filter(|value| !value.is_empty()) {
if let Some(existing_name) = category
.name
.as_deref()
.map(str::trim)
.filter(|value| !value.is_empty())
{
return Ok(Some(existing_name.to_string()));
}
@@ -381,12 +426,21 @@ async fn canonicalize_tags(ctx: &AppContext, raw_tags: &[String]) -> Result<Vec<
}
let existing = tags::Entity::find()
.filter(tags::Column::Slug.eq(&slug))
.filter(
Condition::any()
.add(tags::Column::Slug.eq(&slug))
.add(tags::Column::Name.eq(trimmed)),
)
.one(&ctx.db)
.await?;
let canonical_name = if let Some(tag) = existing {
if let Some(existing_name) = tag.name.as_deref().map(str::trim).filter(|value| !value.is_empty()) {
if let Some(existing_name) = tag
.name
.as_deref()
.map(str::trim)
.filter(|value| !value.is_empty())
{
existing_name.to_string()
} else {
let mut tag_model = tag.into_active_model();
@@ -415,6 +469,132 @@ async fn canonicalize_tags(ctx: &AppContext, raw_tags: &[String]) -> Result<Vec<
Ok(canonical_tags)
}
/// Serialize `post` back to markdown (frontmatter + body) and overwrite its
/// file under the posts directory.
fn write_markdown_post_to_disk(post: &MarkdownPost) -> Result<()> {
    fs::write(markdown_post_path(&post.slug), build_markdown_document(post)).map_err(io_error)
}
/// Rewrite the `category` frontmatter of every on-disk markdown post whose
/// category matches `current_name` or `current_slug` (case/whitespace
/// insensitive). `next_name = Some(name)` renames the category;
/// `None` clears it. Returns how many files were rewritten.
pub fn rewrite_category_references(
    current_name: Option<&str>,
    current_slug: &str,
    next_name: Option<&str>,
) -> Result<usize> {
    ensure_markdown_posts_bootstrapped()?;
    // Collect the normalized keys the old category may appear under.
    let mut match_keys = Vec::new();
    if let Some(name) = current_name {
        let normalized = normalized_match_key(name);
        if !normalized.is_empty() {
            match_keys.push(normalized);
        }
    }
    let normalized_slug = normalized_match_key(current_slug);
    if !normalized_slug.is_empty() {
        match_keys.push(normalized_slug);
    }
    if match_keys.is_empty() {
        return Ok(0);
    }
    // Blank replacement names are treated as "remove the category".
    let next_category = next_name
        .map(str::trim)
        .filter(|value| !value.is_empty())
        .map(ToString::to_string);
    let mut changed = 0_usize;
    let mut posts = load_markdown_posts_from_disk()?;
    for post in &mut posts {
        let Some(category) = post.category.as_deref() else {
            continue;
        };
        if !text_matches_any(category, &match_keys) {
            continue;
        }
        match &next_category {
            // Already equivalent: skip the rewrite to avoid a no-op disk write.
            Some(updated_name) if same_text(category, updated_name) => {}
            Some(updated_name) => {
                post.category = Some(updated_name.clone());
                write_markdown_post_to_disk(post)?;
                changed += 1;
            }
            None => {
                post.category = None;
                write_markdown_post_to_disk(post)?;
                changed += 1;
            }
        }
    }
    Ok(changed)
}
/// Rewrite tag frontmatter across all on-disk markdown posts: every tag
/// matching `current_name` or `current_slug` is renamed to `next_name`
/// (or removed when `None`), de-duplicating the resulting tag list by
/// normalized key. Returns how many files were rewritten.
pub fn rewrite_tag_references(
    current_name: Option<&str>,
    current_slug: &str,
    next_name: Option<&str>,
) -> Result<usize> {
    ensure_markdown_posts_bootstrapped()?;
    // Collect the normalized keys the old tag may appear under.
    let mut match_keys = Vec::new();
    if let Some(name) = current_name {
        let normalized = normalized_match_key(name);
        if !normalized.is_empty() {
            match_keys.push(normalized);
        }
    }
    let normalized_slug = normalized_match_key(current_slug);
    if !normalized_slug.is_empty() {
        match_keys.push(normalized_slug);
    }
    if match_keys.is_empty() {
        return Ok(0);
    }
    // Blank replacement names are treated as "remove the tag".
    let next_tag = next_name
        .map(str::trim)
        .filter(|value| !value.is_empty())
        .map(ToString::to_string);
    let mut changed = 0_usize;
    let mut posts = load_markdown_posts_from_disk()?;
    for post in &mut posts {
        let mut updated_tags = Vec::new();
        // `seen` prevents duplicates (by normalized key) after the rename.
        let mut seen = std::collections::HashSet::new();
        let mut post_changed = false;
        for tag in &post.tags {
            if text_matches_any(tag, &match_keys) {
                post_changed = true;
                if let Some(next_tag_name) = &next_tag {
                    let normalized = normalized_match_key(next_tag_name);
                    if seen.insert(normalized) {
                        updated_tags.push(next_tag_name.clone());
                    }
                }
                continue;
            }
            let normalized = normalized_match_key(tag);
            if seen.insert(normalized) {
                updated_tags.push(tag.clone());
            }
        }
        if post_changed {
            post.tags = updated_tags;
            write_markdown_post_to_disk(post)?;
            changed += 1;
        }
    }
    Ok(changed)
}
async fn dedupe_tags(ctx: &AppContext) -> Result<()> {
let existing_tags = tags::Entity::find()
.order_by_asc(tags::Column::Id)
@@ -425,10 +605,7 @@ async fn dedupe_tags(ctx: &AppContext) -> Result<()> {
for tag in existing_tags {
let key = if tag.slug.trim().is_empty() {
tag.name
.as_deref()
.map(slugify)
.unwrap_or_default()
tag.name.as_deref().map(slugify).unwrap_or_default()
} else {
slugify(&tag.slug)
};
@@ -453,11 +630,7 @@ async fn dedupe_categories(ctx: &AppContext) -> Result<()> {
for category in existing_categories {
let key = if category.slug.trim().is_empty() {
category
.name
.as_deref()
.map(slugify)
.unwrap_or_default()
category.name.as_deref().map(slugify).unwrap_or_default()
} else {
slugify(&category.slug)
};
@@ -474,6 +647,28 @@ async fn dedupe_categories(ctx: &AppContext) -> Result<()> {
pub async fn sync_markdown_posts(ctx: &AppContext) -> Result<Vec<MarkdownPost>> {
let markdown_posts = load_markdown_posts_from_disk()?;
let markdown_slugs = markdown_posts
.iter()
.map(|post| post.slug.clone())
.collect::<std::collections::HashSet<_>>();
let existing_posts = posts::Entity::find().all(&ctx.db).await?;
for stale_post in existing_posts
.into_iter()
.filter(|post| !markdown_slugs.contains(&post.slug))
{
let stale_slug = stale_post.slug.clone();
let related_comments = comments::Entity::find()
.filter(comments::Column::PostSlug.eq(&stale_slug))
.all(&ctx.db)
.await?;
for comment in related_comments {
let _ = comment.delete(&ctx.db).await;
}
let _ = stale_post.delete(&ctx.db).await;
}
for post in &markdown_posts {
let canonical_category = match post.category.as_deref() {
@@ -545,6 +740,18 @@ pub async fn write_markdown_document(
Ok(updated)
}
/// Delete the markdown file for `slug`, then re-sync so the database row
/// for the now-missing post is removed as well (the sync pass also deletes
/// comments attached to stale slugs).
///
/// # Errors
/// `NotFound` when no markdown file exists for the slug.
pub async fn delete_markdown_post(ctx: &AppContext, slug: &str) -> Result<()> {
    ensure_markdown_posts_bootstrapped()?;
    let path = markdown_post_path(slug);
    if !path.exists() {
        return Err(Error::NotFound);
    }
    fs::remove_file(&path).map_err(io_error)?;
    sync_markdown_posts(ctx).await?;
    Ok(())
}
pub async fn create_markdown_post(
ctx: &AppContext,
draft: MarkdownPostDraft,
@@ -594,9 +801,16 @@ pub async fn create_markdown_post(
file_path: markdown_post_path(&slug).to_string_lossy().to_string(),
};
fs::write(markdown_post_path(&slug), build_markdown_document(&post)).map_err(io_error)?;
let path = markdown_post_path(&slug);
if path.exists() {
return Err(Error::BadRequest(format!(
"markdown post already exists for slug: {slug}"
)));
}
fs::write(&path, build_markdown_document(&post)).map_err(io_error)?;
sync_markdown_posts(ctx).await?;
parse_markdown_post(&markdown_post_path(&slug))
parse_markdown_post(&path)
}
pub async fn import_markdown_documents(
@@ -635,7 +849,8 @@ pub async fn import_markdown_documents(
continue;
}
fs::write(markdown_post_path(&slug), normalize_newlines(&file.content)).map_err(io_error)?;
fs::write(markdown_post_path(&slug), normalize_newlines(&file.content))
.map_err(io_error)?;
imported_slugs.push(slug);
}

View File

@@ -1 +1,2 @@
pub mod ai;
pub mod content;