In this tutorial, we build an EverMem-style persistent agent OS. We combine short-term conversational context (STM) with long-term vector memory using FAISS so the agent can recall relevant past information before generating each response. Alongside semantic memory, we also store structured records in SQLite to persist metadata like timestamps, importance scores, and memory signals (preference, fact, task, decision). As we interact with the agent, we see it form new memories, retrieve the most relevant ones for the current query, and maintain consistent behavior across turns.
Check Best Price →
!pip -q install -U transformers sentence-transformers faiss-cpu accelerate
import os, time, json, math, sqlite3, hashlib
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import numpy as np
import faiss
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
def _now_ts():
return int(time.time())
def _sha(s: str) -> str:
return hashlib.sha256(s.encode("utf-8", errors="ignore")).hexdigest()[:16]
def _ensure_dir(p: str):
os.makedirs(p, exist_ok=True)
def _safe_clip(text: str, max_chars: int = 1800) -> str:
text = (text or "").strip()
if len(text) <= max_chars:
return text
return text[:max_chars].rstrip() + " …"
@dataclass
class MemoryItem:
    """A single long-term memory record as stored in SQLite and mirrored in FAISS."""
    mid: str              # stable memory id (format "m:<hash>", see add_memory)
    role: str             # producer of the text: "user", "assistant", or "system"
    text: str             # raw memory content
    created_ts: int       # UNIX timestamp (seconds) at creation
    importance: float     # 0..1 priority score (see _importance_score)
    tokens_est: int       # rough token estimate (word count * 1.25, see _tokens_est)
    meta: Dict[str, Any]  # free-form metadata: signal, pinned, source mids, ...
We set up the full environment by installing the required libraries and importing all dependencies needed for memory, embeddings, generation, and persistence. We define utility helper functions for hashing, timestamps, safe clipping, and directory management to support a stable agent OS foundation. We also introduce the MemoryItem dataclass, which serves as the core structured unit representing each memory item stored in our system.
class EverMemAgentOS:
    """Persistent agent OS: short-term chat context plus FAISS vector LTM backed by SQLite."""

    def __init__(
        self,
        workdir: str = "/content/evermem_agent_os",
        db_name: str = "evermem.sqlite",
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        gen_model: str = "google/flan-t5-small",
        stm_max_turns: int = 10,
        ltm_topk: int = 6,
        consolidate_every: int = 8,
        consolidate_trigger_tokens: int = 1400,
        compress_target_chars: int = 420,
        seed: int = 7,
    ):
        """Load the embedding and generation models, open persistent stores, and reset STM.

        Memory hyperparameters: STM window size, LTM retrieval top-k, consolidation
        cadence (turns) and token threshold, and target length for compressed summaries.
        """
        self.workdir = workdir
        _ensure_dir(self.workdir)
        self.db_path = os.path.join(self.workdir, db_name)
        self.embedder = SentenceTransformer(embedding_model)
        self.embed_dim = self.embedder.get_sentence_embedding_dimension()
        self.tokenizer = AutoTokenizer.from_pretrained(gen_model)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(gen_model)
        # BUG FIX: self.device was used below without ever being defined,
        # which raised AttributeError at construction. Pick GPU when available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()
        self.stm_max_turns = stm_max_turns
        self.ltm_topk = ltm_topk
        self.consolidate_every = consolidate_every
        self.consolidate_trigger_tokens = consolidate_trigger_tokens
        self.compress_target_chars = compress_target_chars
        np.random.seed(seed)
        self._init_db()
        self._init_faiss()
        self.stm: List[Dict[str, str]] = []  # rolling chat turns: {"role", "content"}
        self.turns = 0  # completed chat() calls; drives consolidation cadence
def _init_db(self):
conn = sqlite3.connect(self.db_path)
cur = conn.cursor()
cur.execute(
"""
CREATE TABLE IF NOT EXISTS memories (
mid TEXT PRIMARY KEY,
role TEXT,
text TEXT,
created_ts INTEGER,
importance REAL,
tokens_est INTEGER,
meta_json TEXT
)
"""
)
cur.execute(
"""
CREATE TABLE IF NOT EXISTS kv_store (
k TEXT PRIMARY KEY,
v_json TEXT,
updated_ts INTEGER
)
"""
)
cur.execute(
"""
CREATE TABLE IF NOT EXISTS consolidations (
cid TEXT PRIMARY KEY,
created_ts INTEGER,
summary TEXT,
source_mids_json TEXT
)
"""
)
conn.commit()
conn.close()
def _init_faiss(self):
    """Load the FAISS index and id map from disk, or create fresh ones.

    The id map translates FAISS row ids (ints) to SQLite memory ids (mids).
    """
    self.faiss_index_path = os.path.join(self.workdir, "faiss.index")
    self.faiss_map_path = os.path.join(self.workdir, "faiss_map.json")
    if os.path.exists(self.faiss_index_path) and os.path.exists(self.faiss_map_path):
        # Resume a previous session: JSON keys arrive as strings, remap to ints.
        self.index = faiss.read_index(self.faiss_index_path)
        with open(self.faiss_map_path, "r", encoding="utf-8") as f:
            self.id_map = json.load(f)
        self.id_map = {int(k): v for k, v in self.id_map.items()}
        self.next_faiss_id = (max(self.id_map.keys()) + 1) if self.id_map else 0
        return
    # First run: inner-product over normalized embeddings (see _embed) == cosine similarity.
    self.index = faiss.IndexFlatIP(self.embed_dim)
    self.id_map: Dict[int, str] = {}
    self.next_faiss_id = 0
    self._persist_faiss()
def _persist_faiss(self):
    """Write the FAISS index and the (faiss id -> mid) map to the working directory."""
    faiss.write_index(self.index, self.faiss_index_path)
    with open(self.faiss_map_path, "w", encoding="utf-8") as f:
        # JSON object keys must be strings; _init_faiss converts them back to ints.
        json.dump({str(k): v for k, v in self.id_map.items()}, f)
def _embed(self, texts: List[str]) -> np.ndarray:
    """Encode *texts* into L2-normalized float32 vectors, always shaped (n, dim)."""
    matrix = self.embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    if matrix.ndim == 1:
        # A single input may come back 1-D; promote it to a one-row matrix.
        matrix = matrix[np.newaxis, :]
    return matrix.astype("float32")
def _tokens_est(self, text: str) -> int:
text = text or ""
return max(1, int(len(text.split()) * 1.25))
def _importance_score(self, role: str, text: str, meta: Dict[str, Any]) -> float:
base = 0.35
length_bonus = min(0.45, math.log1p(len(text)) / 20.0)
role_bonus = 0.08 if role == "user" else 0.03
pin = 0.35 if meta.get("pinned") else 0.0
signal = meta.get("signal", "")
signal_bonus = 0.18 if signal in {"decision", "preference", "fact", "task"} else 0.0
q_bonus = 0.06 if "?" in text else 0.0
number_bonus = 0.05 if any(ch.isdigit() for ch in text) else 0.0
return float(min(1.0, base + length_bonus + role_bonus + pin + signal_bonus + q_bonus + number_bonus))
def upsert_kv(self, k: str, v: Any):
    """Insert or overwrite key *k* in kv_store with *v* serialized as JSON."""
    payload = (k, json.dumps(v, ensure_ascii=False), _now_ts())
    conn = sqlite3.connect(self.db_path)
    try:
        conn.execute(
            "INSERT INTO kv_store (k, v_json, updated_ts) VALUES (?, ?, ?) "
            "ON CONFLICT(k) DO UPDATE SET v_json=excluded.v_json, updated_ts=excluded.updated_ts",
            payload,
        )
        conn.commit()
    finally:
        conn.close()
def get_kv(self, k: str, default=None):
    """Fetch key *k* from kv_store, JSON-decoded; return *default* on miss or bad JSON."""
    conn = sqlite3.connect(self.db_path)
    try:
        row = conn.execute("SELECT v_json FROM kv_store WHERE k=?", (k,)).fetchone()
    finally:
        conn.close()
    if row is None:
        return default
    try:
        return json.loads(row[0])
    except Exception:
        # Corrupt stored JSON degrades to the default rather than raising.
        return default
def add_memory(self, role: str, text: str, meta: Optional[Dict[str, Any]] = None) -> str:
    """Persist one memory: a row in SQLite plus its embedding in the FAISS index.

    Honors meta-supplied "mid" and "importance" overrides; returns the memory id.
    """
    meta = meta or {}
    text = (text or "").strip()
    nonce = np.random.randint(0, 10**9)
    mid = meta.get("mid") or f"m:{_sha(f'{_now_ts()}::{role}::{text[:80]}::{nonce}')}"
    created_ts = _now_ts()
    tokens_est = self._tokens_est(text)
    if meta.get("importance") is not None:
        importance = float(meta.get("importance"))
    else:
        importance = self._importance_score(role, text, meta)
    row = (mid, role, text, created_ts, importance, tokens_est, json.dumps(meta, ensure_ascii=False))
    conn = sqlite3.connect(self.db_path)
    try:
        conn.execute(
            "INSERT OR REPLACE INTO memories (mid, role, text, created_ts, importance, tokens_est, meta_json) VALUES (?, ?, ?, ?, ?, ?, ?)",
            row,
        )
        conn.commit()
    finally:
        conn.close()
    # Mirror the record into the vector index and persist index + id map.
    vec = self._embed([text])
    fid = self.next_faiss_id
    self.next_faiss_id += 1
    self.index.add(vec)
    self.id_map[fid] = mid
    self._persist_faiss()
    return mid
We initialize the EverMemAgentOS class and configure the embedding model, generation model, device selection, and memory hyperparameters. We create the SQLite schema for persistent storage and initialize the FAISS index for vector-based long-term memory retrieval. We also implement a memory-writing pipeline, including importance scoring and vector insertion, enabling the agent to store structured and semantic memory simultaneously.
def _fetch_memories_by_ids(self, mids: List[str]) -> List[MemoryItem]:
    """Load MemoryItem rows for *mids*, preserving the order of the input list."""
    if not mids:
        return []
    placeholders = ",".join("?" * len(mids))
    query = (
        "SELECT mid, role, text, created_ts, importance, tokens_est, meta_json "
        f"FROM memories WHERE mid IN ({placeholders})"
    )
    conn = sqlite3.connect(self.db_path)
    try:
        rows = conn.execute(query, mids).fetchall()
    finally:
        conn.close()
    items: List[MemoryItem] = []
    for mid, role, text, created_ts, importance, tokens_est, meta_json in rows:
        try:
            meta = json.loads(meta_json) if meta_json else {}
        except Exception:
            meta = {}  # tolerate corrupt metadata
        items.append(
            MemoryItem(
                mid=mid,
                role=role,
                text=text,
                created_ts=int(created_ts),
                importance=float(importance),
                tokens_est=int(tokens_est),
                meta=meta,
            )
        )
    # SQL IN gives no ordering guarantee; restore the caller's mid order.
    order = {m: i for i, m in enumerate(mids)}
    items.sort(key=lambda it: order.get(it.mid, 10**9))
    return items
def retrieve_ltm(self, query: str, topk: Optional[int] = None) -> List[MemoryItem]:
    """Vector-search long-term memory for *query* and return up to *topk* items."""
    topk = topk or self.ltm_topk
    qv = self._embed([query])
    # Over-fetch a little so padding ids and unmapped hits don't starve the result.
    _, ids = self.index.search(qv, topk + 8)
    hits = []
    for fid in ids[0].tolist():
        if fid == -1:
            continue  # FAISS pads with -1 when the index holds fewer vectors
        mapped = self.id_map.get(int(fid))
        if mapped:
            hits.append(mapped)
    unique = list(dict.fromkeys(hits))[:topk]  # dedupe, keep rank order
    return self._fetch_memories_by_ids(unique)
def _format_stm(self) -> str:
turns = self.stm[-self.stm_max_turns:]
chunks = []
for t in turns:
chunks.append(f"{t['role'].upper()}: {t['content']}")
return "n".join(chunks).strip()
def _format_ltm(self, ltm_items: List[MemoryItem]) -> str:
    """Render retrieved LTM items as numbered lines with importance, age, and signal tag."""
    if not ltm_items:
        return ""
    lines = []
    for i, it in enumerate(ltm_items, 1):
        ts_age = max(1, (_now_ts() - it.created_ts) // 3600)  # whole hours, minimum 1
        imp = f"{it.importance:.2f}"
        tag = it.meta.get("signal", "")
        tag = f" | {tag}" if tag else ""
        lines.append(f"[LTM {i}] (imp={imp}, age_h={ts_age}{tag}) {it.role}: {_safe_clip(it.text, 420)}")
    # FIX: join with a real newline — the scraped source dropped the backslash in "\n".
    return "\n".join(lines).strip()
@torch.inference_mode()
def _gen(self, prompt: str, max_new_tokens: int = 180) -> str:
    """Run sampled seq2seq generation on *prompt* and return the decoded text."""
    encoded = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    encoded = encoded.to(self.device)
    generated = self.model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        do_sample=True,  # stochastic decoding; temperature/top_p shape the sampling
        temperature=0.6,
        top_p=0.92,
        num_beams=1,
    )
    decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)
    return (decoded or "").strip()
def _compress_memories(self, items: List[MemoryItem], max_chars: int = 520) -> str:
    """Summarize *items* into one compact memory string of at most *max_chars* characters."""
    # FIX: real newlines restored — the published snippet had lost the "\n" backslashes.
    raw = "\n".join(f"- {it.role}: {it.text}" for it in items)
    raw = _safe_clip(raw, 3500)  # keep the prompt within the generator's context budget
    prompt = (
        "Summarize the following notes into a compact memory that preserves decisions, preferences, facts, and tasks. "
        f"Keep it under {max_chars} characters.\n\nNOTES:\n{raw}\n\nCOMPACT MEMORY:"
    )
    summ = self._gen(prompt, max_new_tokens=170).strip()
    if len(summ) > max_chars:
        summ = summ[:max_chars].rstrip() + "…"
    return summ
def consolidate(self) -> Optional[str]:
    """Summarize the highest-value recent memories into one pinned consolidation.

    Scans up to the 160 newest memories, ranks them by importance plus a small
    recency bonus, compresses the top 18 into a summary, records it in the
    consolidations table, and re-injects it as a pinned system memory.
    Returns the consolidation id, or None when there are no memories yet.
    """
    conn = sqlite3.connect(self.db_path)
    cur = conn.cursor()
    cur.execute("SELECT mid, role, text, created_ts, importance, tokens_est, meta_json FROM memories ORDER BY created_ts DESC LIMIT 160")
    rows = cur.fetchall()
    conn.close()
    items = []
    for r in rows:
        try:
            meta = json.loads(r[6]) if r[6] else {}
        except Exception:
            meta = {}  # tolerate corrupt metadata rather than abort consolidation
        items.append(MemoryItem(r[0], r[1], r[2], int(r[3]), float(r[4]), int(r[5]), meta))
    if not items:
        return None
    # Rank by importance plus a recency bonus that decays with age in hours;
    # both key components are negated so the sort is descending, newest-first on ties.
    items_sorted = sorted(items, key=lambda x: (-(x.importance + 0.15 * (1.0 / (1.0 + (_now_ts() - x.created_ts) / 3600.0))), -x.created_ts))
    picked = items_sorted[:18]
    summary = self._compress_memories(picked, max_chars=520)
    cid = f"c:{_sha(f'{_now_ts()}::{summary[:120]}::{np.random.randint(0, 10**9)}')}"
    source_mids = [it.mid for it in picked]
    conn = sqlite3.connect(self.db_path)
    cur = conn.cursor()
    cur.execute(
        "INSERT OR REPLACE INTO consolidations (cid, created_ts, summary, source_mids_json) VALUES (?, ?, ?, ?)",
        (cid, _now_ts(), summary, json.dumps(source_mids, ensure_ascii=False)),
    )
    conn.commit()
    conn.close()
    # Store the summary itself as a pinned, high-importance system memory so
    # future retrievals can surface it directly.
    self.add_memory(
        role="system",
        text=f"Consolidated memory: {summary}",
        meta={"signal": "consolidation", "pinned": True, "source_mids": source_mids, "cid": cid, "importance": 0.95},
    )
    return cid
We implement semantic retrieval and formatting logic that enables the agent to fetch relevant long-term memories before reasoning. We define how short-term memory and retrieved long-term memory are structured and how they are injected into prompts for contextual generation. We also implement memory compression and consolidation logic, allowing the agent to periodically summarize high-value memories into durable long-term summaries.
def _should_consolidate(self) -> bool:
if self.turns > 0 and self.turns % self.consolidate_every == 0:
return True
conn = sqlite3.connect(self.db_path)
cur = conn.cursor()
cur.execute("SELECT SUM(tokens_est) FROM memories")
s = cur.fetchone()[0]
conn.close()
s = int(s or 0)
return s >= self.consolidate_trigger_tokens
def chat(self, user_text: str, user_meta: Optional[Dict[str, Any]] = None, max_answer_tokens: int = 220) -> Dict[str, Any]:
    """One full agent turn: store the input, retrieve LTM, generate, store the answer.

    Returns a dict with the answer text, the retrieved LTM snippets, and the id
    of any consolidation pass triggered this turn (else None).
    """
    user_meta = user_meta or {}
    self.turns += 1
    self.stm.append({"role": "user", "content": user_text})
    self.stm = self.stm[-(self.stm_max_turns * 2):]  # cap STM at 2x the render window
    self.add_memory("user", user_text, meta=user_meta)
    ltm = self.retrieve_ltm(user_text, topk=self.ltm_topk)
    stm_block = self._format_stm()
    ltm_block = self._format_ltm(ltm)
    sys_rules = (
        "You are an AI agent with persistent memory. Use retrieved long-term memories to stay consistent. "
        "If a memory conflicts with the user, ask a short clarifying question. Keep answers practical."
    )
    # FIX: real newlines restored in the prompt — the published snippet had lost
    # the "\n" backslashes, collapsing the sections into one unreadable line.
    prompt = (
        f"{sys_rules}\n\n"
        f"SHORT-TERM CONTEXT:\n{_safe_clip(stm_block, 1800)}\n\n"
        f"RETRIEVED LONG-TERM MEMORIES:\n{ltm_block if ltm_block else '(none)'}\n\n"
        f"USER REQUEST:\n{user_text}\n\n"
        f"ANSWER:"
    )
    answer = self._gen(prompt, max_new_tokens=max_answer_tokens)
    self.stm.append({"role": "assistant", "content": answer})
    self.stm = self.stm[-(self.stm_max_turns * 2):]
    self.add_memory("assistant", answer, meta={"signal": "response"})
    consolidation_id = None
    if self._should_consolidate():
        consolidation_id = self.consolidate()
    return {
        "answer": answer,
        "retrieved_ltm": [
            {"mid": it.mid, "role": it.role, "importance": it.importance, "meta": it.meta, "text": _safe_clip(it.text, 320)}
            for it in ltm
        ],
        "consolidation_id": consolidation_id,
    }
def inspect_recent_memories(self, n: int = 12) -> List[Dict[str, Any]]:
    """Return the *n* newest memories as plain dicts (text clipped for display)."""
    conn = sqlite3.connect(self.db_path)
    try:
        rows = conn.execute(
            "SELECT mid, role, text, created_ts, importance, tokens_est, meta_json FROM memories ORDER BY created_ts DESC LIMIT ?",
            (n,),
        ).fetchall()
    finally:
        conn.close()
    report = []
    for mid, role, text, created_ts, importance, tokens_est, meta_json in rows:
        try:
            meta = json.loads(meta_json) if meta_json else {}
        except Exception:
            meta = {}  # tolerate corrupt metadata
        report.append({
            "mid": mid,
            "role": role,
            "created_ts": int(created_ts),
            "importance": float(importance),
            "tokens_est": int(tokens_est),
            "meta": meta,
            "text": _safe_clip(text, 520),
        })
    return report
def inspect_consolidations(self, n: int = 5) -> List[Dict[str, Any]]:
    """Return the *n* newest consolidation summaries with their source memory ids."""
    conn = sqlite3.connect(self.db_path)
    try:
        rows = conn.execute(
            "SELECT cid, created_ts, summary, source_mids_json FROM consolidations ORDER BY created_ts DESC LIMIT ?",
            (n,),
        ).fetchall()
    finally:
        conn.close()
    report = []
    for cid, created_ts, summary, src_json in rows:
        try:
            sources = json.loads(src_json) if src_json else []
        except Exception:
            sources = []  # tolerate corrupt source lists
        report.append({"cid": cid, "created_ts": int(created_ts), "summary": summary, "source_mids": sources})
    return report
We implement the agent’s main reasoning loop in the chat() function, combining STM, LTM retrieval, and generation into a single workflow. We ensure that every interaction updates both vector memory and structured memory while maintaining contextual coherence. We also include automatic consolidation triggers so the system behaves like a persistent memory OS rather than a simple chatbot.
# --- Demo: instantiate the agent and exercise multi-turn persistent memory ---
agent = EverMemAgentOS()
agent.upsert_kv("profile", {"name": "User", "preferences": {"style": "concise"}})
demo_queries = [
    ("I prefer answers in bullet points and I’m working on a Colab tutorial.", {"signal": "preference", "pinned": True}),
    ("Remember that my project is about an EverMem-style agent OS with FAISS + SQLite.", {"signal": "fact", "pinned": True}),
    ("Give me a 5-step plan to add memory importance scoring and consolidation.", {"signal": "task"}),
    ("Now remind me what you know about my preferences and project, briefly.", {"signal": "task"}),
]
for q, meta in demo_queries:
    r = agent.chat(q, user_meta=meta, max_answer_tokens=180)
    # FIX: "\n" restored in the prints below — the scraped article dropped the backslashes.
    print("\nUSER:", q)
    print("ASSISTANT:", r["answer"])
    if r["retrieved_ltm"]:
        print("RETRIEVED_LTM:", [(x["importance"], x["text"]) for x in r["retrieved_ltm"][:3]])
    if r["consolidation_id"]:
        print("CONSOLIDATED:", r["consolidation_id"])
print("\nRECENT MEMORIES:")
for m in agent.inspect_recent_memories(10):
    print(m["role"], m["importance"], m["text"])
print("\nRECENT CONSOLIDATIONS:")
for c in agent.inspect_consolidations(3):
    print(c["cid"], c["summary"])
We instantiate the agent and simulate multi-turn interactions to demonstrate persistent recall and memory usage. We observe how the agent retrieves relevant long-term memories and uses them to produce consistent responses. Finally, we inspect stored memories and consolidations to verify that our EverMem-style architecture actively manages and evolves its memory over time.
In conclusion, we have a working memory-centric agent that behaves less like a stateless chatbot and more like a persistent assistant that learns from interactions. We implemented importance scoring to prioritize what matters, vector retrieval to fetch the right context at the right time, and periodic consolidation to compress multiple memories into durable summaries that improve long-horizon recall. We also kept the system practical for Colab by using lightweight models, FAISS for fast similarity search, and SQLite for structured persistence.
Check out the Full Codes here. Also, feel free to follow us on Twitter, and don’t forget to join our 120k+ ML SubReddit and subscribe to our Newsletter. Wait — are you on Telegram? Now you can join us on Telegram as well.
The post How to Build an EverMem-Style Persistent AI Agent OS with Hierarchical Memory, FAISS Vector Retrieval, SQLite Storage, and Automated Memory Consolidation appeared first on MarkTechPost.











