
Commit 6b23d99

feat: 对接 ollama (integrate with Ollama)

1 parent 41058be

11 files changed: +201 -45 lines changed

.gitignore
+2 -1

@@ -11,4 +11,5 @@ data/
 .DS_Store
 *.pt
 *.pth
-*.log
+*.log
+*.lock

README.md
+26

@@ -2,6 +2,32 @@
 
 This project aims using AI to simplify the file management on NAS.
 
+## TODO List
+
+- File System Base:
+  - [x] File Browser
+  - [ ] File Manual Tag
+  - [ ] File index created and updated in real time
+  - [ ] File system event watching
+  - [ ] File encryption at write
+  - [ ] Multi-NAS support
+- Image Files:
+  - [x] Image Browser
+  - [x] Image Snapshot
+  - [x] Image caption using a local vision model
+  - [x] Image auto-tagging
+  - [ ] Image caption and tags using an LLM
+  - [ ] Image search by tag and caption
+  - [ ] Image search by similarity
+  - [ ] RAW image support
+- Video Files:
+  - [ ] Video Player
+  - [ ] Video Caption
+- Document Files:
+  - [ ] Support document preview and edit
+  - [ ] Use RAG to build a knowledge base
+  - [ ] Document search using vectors
+
 ### extra things when use this project
 
 decord install failed when install LAVIS:

file-server-dl/config.py
+17

@@ -0,0 +1,17 @@
+from infra.ollama import Config as OllamaConfig
+import yaml
+
+class Config:
+    def __init__(self):
+        self.ollama = OllamaConfig()
+        self.nas_root_path = ""
+    def from_yaml_file(self, file_path: str):
+        with open(file_path, 'r') as f:
+            config = yaml.safe_load(f)
+        self.ollama = OllamaConfig()
+        self.ollama.enabled = config['ollama']['enabled']
+        self.ollama.model = config['ollama']['model']
+        self.ollama.host = config['ollama']['host']
+        self.ollama.port = config['ollama']['port']
+        self.nas_root_path = config['nas_root_path']
+        return self

file-server-dl/config.yaml
+7 -1

@@ -1 +1,7 @@
-nas_root_path: /Users/weibo/code/myself/anfm/tests
+nas_root_path: /Users/weibo/code/myself/anfm/tests
+ollama:
+  enabled: true
+  host: localhost
+  port: 11434
+  model: llama3.2-vision
+
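
Once the server is running, the endpoint configured above can be smoke-tested with a few lines. This is a sketch, not part of the commit; it uses Ollama's `/api/tags` endpoint (which lists locally pulled models but is not called anywhere in this codebase) and assumes the host/port/model values from config.yaml:

```python
import requests

# /api/tags lists the models pulled on this Ollama instance
resp = requests.get("http://localhost:11434/api/tags")
models = [m["name"] for m in resp.json()["models"]]
# pulled models carry a tag suffix such as ":latest", hence startswith
assert any(name.startswith("llama3.2-vision") for name in models), "model not pulled"
```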

file-server-dl/infra/ollama.py
+31

@@ -0,0 +1,31 @@
+import requests
+import json
+
+class Config:
+    enabled: bool
+    model: str
+    host: str
+    port: int
+
+class OllamaClient:
+    def __init__(self, config: Config):
+        self.config = config
+
+    def request_ollama_generate(self, body: str, model: str = None, image: list[str] = None, format: dict = None) -> dict:
+        '''
+        Ask Ollama to generate text; returns the parsed JSON response.
+        '''
+        req_body = {
+            'prompt': body,
+            'stream': False
+        }
+        if model:
+            req_body['model'] = model
+        else:
+            req_body['model'] = self.config.model
+        if image:
+            req_body['images'] = image  # /api/generate expects base64 images under the 'images' key
+        if format:
+            req_body['format'] = format
+        resp = requests.post(f'http://{self.config.host}:{self.config.port}/api/generate', json=req_body)
+        return json.loads(resp.json()['response'])
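
For reference, a minimal usage sketch of the client above (not part of the commit). The base64 payload is a placeholder, and the `format` dict follows the JSON-schema style structured output that Ollama's `/api/generate` accepts; with a schema supplied, the `response` field is a JSON string, which `request_ollama_generate` parses into a dict:

```python
from infra.ollama import OllamaClient, Config

cfg = Config()
cfg.enabled = True
cfg.model = "llama3.2-vision"
cfg.host = "localhost"
cfg.port = 11434

client = OllamaClient(cfg)
result = client.request_ollama_generate(
    body="Describe this image in one sentence.",
    image=["<base64-encoded JPEG>"],  # placeholder, not a real image
    format={
        "type": "object",
        "properties": {"caption": {"type": "string"}},
        "required": ["caption"],
    },
)
print(result["caption"])
```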

file-server-dl/main.py
+6 -7

@@ -1,16 +1,15 @@
 from magika import Magika
 from flask import Flask, request
 from service.file_understanding import FileUnderstanding
+from config import Config
 import logging
 import yaml
 
-logging.basicConfig(level=logging.DEBUG)
-global config
-
-# read config from config.yaml file then parse into config object
-with open("config.yaml", "r") as file:
-    config = yaml.safe_load(file)
-
+logging.basicConfig(level=logging.INFO)
+config = Config().from_yaml_file("config.yaml")
+print("===============")
+print(config.__dict__)
+print(config.ollama.__dict__)
 fileUnderstanding = FileUnderstanding(config=config)
 logging.info("File Understanding Service Started")

file-server-dl/service/file_understanding.py
+6 -4

@@ -2,19 +2,21 @@
 from pathlib import Path
 from models.file import FileUnderstandingResult
 from service.image_understanding import ImageUnderstanding
+from config import Config
 import logging
 
+
 class FileUnderstanding:
-    def __init__(self, config: any):
+    def __init__(self, config: Config):
         self.magika = Magika()
-        self.image_understanding = ImageUnderstanding()
+        self.image_understanding = ImageUnderstanding(config)
         self.config = config
 
     def understand(self, path: str) -> FileUnderstandingResult:
-        result = self.magika.identify_path(Path(self.config['nas_root_path'] + path))
+        result = self.magika.identify_path(Path(self.config.nas_root_path + path))
         file_understanding = FileUnderstandingResult(result.output.ct_label, result.output.group, result.output.description)
         logging.info("File Understanding: %s", file_understanding)
         if file_understanding.group == 'image':
-            file_understanding.set_ext(self.image_understanding.understand( self.config['nas_root_path'] + path))
+            file_understanding.set_ext(self.image_understanding.understand(self.config.nas_root_path + path))
         logging.info("File Understanding Result: %s", file_understanding)
         return file_understanding
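
A hypothetical smoke test of the new wiring (the image path is a placeholder): the service now receives a typed Config instead of a raw dict, so attribute access replaces the old key lookups:

```python
from config import Config
from service.file_understanding import FileUnderstanding

config = Config().from_yaml_file("config.yaml")
fu = FileUnderstanding(config=config)
# the path is resolved against config.nas_root_path, as in understand()
result = fu.understand("/photos/example.jpg")  # placeholder file
print(result)
```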

file-server-dl/service/image_understanding.py
+77 -6

@@ -1,5 +1,7 @@
 import logging
 import torch
+import base64
+from io import BytesIO
 import cn_clip.clip as clip
 from cn_clip.clip import load_from_name
 from lavis.models import load_model_and_preprocess
@@ -9,24 +11,29 @@
 import numpy as np
 from transformers import AutoModel, AutoImageProcessor
 from infra.milvus import conn as milvus_conn
+from infra.ollama import OllamaClient, Config as OllamaConfig
+from config import Config
 
 class ImageUnderstanding:
-    def __init__(self):
+    def __init__(self, config: Config):
         # check if torch_directml is available
         if importutil.find_spec("torch_directml") is None:
             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         else:
             import torch_directml
             self.device = torch_directml.device()
+        self.config = config
         self.clip_model = None
         self.clip_preprocess = None
         self.text_labels = None
         self.text_feature = None
         self.caption_model = None
         self.caption_vis_processors = None
         self.milvus_conn = milvus_conn
-        self.__init_clip_model__()
-        self.__init_caption_model__()
+        if not config.ollama.enabled:
+            # if ollama is not enabled, initialize the local clip and caption models
+            self.__init_clip_model__()
+            self.__init_caption_model__()
         self.__init_embedding_model__()
 
     def __init_clip_model__(self):
@@ -88,20 +95,84 @@ def image_embedding(self, path: str) -> np.ndarray:
         inputs = self.embedding_processor(image, return_tensors="pt").to(self.device)
         outputs = self.embedding_model(**inputs)
         embedding = outputs.pooler_output.cpu().detach().numpy().flatten()
+        return embedding
 
-    def understand(self, path: str) -> ImageUnderstandingResult:
+    def image_understand_with_local_model(self, path: str):
         labels = self.label_image(path)
         logging.info("Image Labels: %s", labels)
         caption = self.caption_image(path)
         logging.info("Image Caption: %s", caption)
+        return labels, caption
+
+    def image_understand_with_ollama(self, path: str):
+        image = Image.open(path)
+        # resize the long side down to 1024
+        width, height = image.size
+        if width > height:
+            if width > 1024:
+                height = int(1024 * height / width)
+                width = 1024
+        if width <= height:
+            if height > 1024:
+                width = int(1024 * width / height)
+                height = 1024
+        image = image.resize((width, height))
+        # write the image into a byte buffer
+        buffered = BytesIO()
+        image.save(buffered, format="JPEG")
+        bts = buffered.getvalue()
+        b64str = base64.b64encode(bts).decode("utf-8")
+        prompt = '''You are an experienced art critic and photographer who specializes in evaluating works of art using simple and beautiful language.
+Now, please use a short paragraph to describe the content of the picture you see, and use this paragraph as the 'caption' in your answer.
+After that, give 3-5 high-level words that summarize and label the image; these words will be used as the 'tags' in your answer.
+Finally, rate the image from four perspectives: 'composition', 'light and shadow', 'color' and 'idea of the work', and give a final score of 0-10 in steps of 0.1. The four scores and the overall rating together will be used as the 'score' in your answer, and you must also give a reason why you gave those scores from the four perspectives above, which will be used as the 'reason' in your answer.
+Your answer must be returned in JSON format; if you are not sure what you are seeing, just return an empty JSON object, e.g. '{}'. The content of your answer MUST be in 'Chinese', including 'caption', 'tags' and 'reason'; any non-Chinese answer will be considered invalid.
+Your answer should not contain any subjective personal pronouns, e.g. 'I', 'we' etc. When you think you need to use them, please use words such as 'audience', 'others' etc. instead.'''
+        format = {
+            "type": "object",
+            "properties": {
+                "caption": {
+                    "type": "string"
+                },
+                "tags": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "score": {
+                    "type": "array",
+                    "items": {
+                        "type": "number",
+                        "minimum": 0,
+                        "maximum": 10
+                    }
+                },
+                "reason": {
+                    "type": "string"
+                }
+            },
+            "required": ["caption", "tags", "score", "reason"]
+        }
+        result = OllamaClient(config=self.config.ollama).request_ollama_generate(body=prompt, image=[b64str], format=format)
+        # labels returned by ollama carry no confidence score
+        return [ImageLabel(x, 0.0) for x in result['tags']], result['caption']
+
+    def understand(self, path: str) -> ImageUnderstandingResult:
+        if self.config.ollama.enabled:
+            # use ollama
+            labels, caption = self.image_understand_with_ollama(path)
+        else:
+            # use the local models
+            labels, caption = self.image_understand_with_local_model(path)
         embedding = self.image_embedding(path)
         self.milvus_conn.insert(embedding, path)
-        logging.info("Image Embedding: %s", embedding)
         return ImageUnderstandingResult(labels, caption)
 
     def image_similarity(self, path: str) -> list[dict]:
         embedding = self.image_embedding(path)
         records = self.milvus_conn.search_by_vec(embedding)
         results = []
         for record in records:
-            results.append({"path": record.id, "score": record.distance})
+            results.append({"path": record.id, "score": record.distance})
+        return results
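
The resize-and-encode steps in image_understand_with_ollama can also be read as a standalone helper. Below is a sketch under the same assumptions; `encode_for_ollama` and `sample.jpg` are hypothetical names, and the `convert("RGB")` call is an addition (not in the commit) to guard against alpha-channel images, which JPEG cannot store:

```python
import base64
from io import BytesIO
from PIL import Image

def encode_for_ollama(path: str, max_side: int = 1024) -> str:
    """Cap the long side at max_side, re-encode as JPEG, return base64."""
    image = Image.open(path).convert("RGB")  # JPEG has no alpha channel
    width, height = image.size
    scale = max_side / max(width, height)
    if scale < 1:  # only downscale, never upscale
        image = image.resize((int(width * scale), int(height * scale)))
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

b64str = encode_for_ollama("sample.jpg")  # placeholder path
```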

file-server/internal/domain/file/tree.go
+1

@@ -11,6 +11,7 @@ func init() {
     loadTree()
 }
 
+// TODO: refactor into sqlite.
 var Root = &DirNode{
     Name: "/",
     Path: "/",

file-server/internal/tasks/file_process_task.go
+19 -16

@@ -10,7 +10,6 @@ import (
     "fileserver/utils"
     "log"
     "strings"
-    "sync"
     "time"
 )
 
@@ -55,6 +54,7 @@ func (t *FileProcessTaskHandler) Start(ctx context.Context) error {
         case task := <-t.taskChan:
             _task := (task).(*entity.FileProcessTask)
             t.singleFileHandler(ctx, _task.File)
+            log.Default().Printf("file process task: %s complete", _task.File)
         }
     }
 }
@@ -89,26 +89,29 @@ func (t *FileProcessTaskHandler) singleFileHandler(ctx context.Context, file str
         }
     }
 
-    wg := sync.WaitGroup{}
-    wg.Add(1)
-    go func() {
-        defer wg.Done()
-        result, err := dl.NewClient(t.config.DLConfiguration).Understanding(ctx, dl.UnderstandingRequest{
-            Path: file,
-        })
-        if err != nil {
-            log.Default().Printf("error getting file type: %v", err)
-            return
-        }
-        _file.SetFileTypeFromUnderstanding(result)
-    }()
+    // wg := sync.WaitGroup{}
+    // wg.Add(1)
+    // go func() {
+    //     defer wg.Done()
+    result, err := dl.NewClient(t.config.DLConfiguration).Understanding(ctx, dl.UnderstandingRequest{
+        Path: file,
+    })
+    if err != nil {
+        log.Default().Printf("error getting file type: %v", err)
+    } else {
+        log.Default().Printf("file understand result: %v", result)
+    }
+    _file.SetFileTypeFromUnderstanding(result)
+    // }()
 
     // insert into database
-    wg.Wait()
+    // wg.Wait()
     if _file.Group == "image" {
+        log.Default().Printf("send image compression task: %s", file)
         bus.Send(&entity.ImageCompressionTask{File: _file})
     }
-    err := t.repo.CreateOrUpdateFile(ctx, _file)
+    log.Default().Printf("insert file %s", file)
+    err = t.repo.CreateOrUpdateFile(ctx, _file)
     if err != nil {
         log.Default().Printf("error inserting file %s: %v", file, err)
     }

file-server/internal/tasks/task_bus.go
+9 -10

@@ -9,33 +9,32 @@ var bus *TaskBus
 
 func init() {
     bus = NewTaskBus()
-    go bus.TaskHandleLoop()
     log.Default().Println("task bus init")
 }
 
 // TaskBus is the task bus, used for communication between tasks
 type TaskBus struct {
-    bus      chan server.ITask
+    buses    map[string]chan server.ITask
     handlers map[string]server.BackendTaskHandler
 }
 
 func NewTaskBus() *TaskBus {
     return &TaskBus{
-        bus:      make(chan server.ITask),
+        buses:    make(map[string]chan server.ITask),
        handlers: make(map[string]server.BackendTaskHandler),
     }
 }
 
 func (b *TaskBus) Send(task server.ITask) {
-    b.bus <- task
+    b.buses[task.GetTaskName()] <- task
 }
 
 func (b *TaskBus) RegisterHandler(task server.BackendTaskHandler) {
     b.handlers[task.GetTaskName()] = task
-}
-
-func (b *TaskBus) TaskHandleLoop() {
-    for task := range b.bus {
-        b.handlers[task.GetTaskName()].Append(task)
-    }
+    b.buses[task.GetTaskName()] = make(chan server.ITask, 100)
+    go func() {
+        for _task := range b.buses[task.GetTaskName()] {
+            task.Append(_task)
+        }
+    }()
 }