-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_ollama_qwen3.py
More file actions
141 lines (122 loc) · 4.66 KB
/
Copy pathtest_ollama_qwen3.py
File metadata and controls
141 lines (122 loc) · 4.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
Test Qwen3 with Ollama (optimized GGUF quantization)
This should be MUCH faster than bitsandbytes on RTX 5090
"""
import subprocess
import sys
import time
import json
print("=" * 60)
print("Testing Qwen3 with Ollama")
print("=" * 60)
# Check if Ollama is installed
print("\nπ Checking Ollama installation...")
try:
result = subprocess.run(["ollama", "--version"], capture_output=True, text=True, check=True)
print(f"β
Ollama installed: {result.stdout.strip()}")
except FileNotFoundError:
print("β Ollama not installed!")
print("\nInstall with:")
print(" winget install Ollama.Ollama")
print(" OR download from: https://ollama.com/download")
sys.exit(1)
except Exception as e:
print(f"β Error checking Ollama: {e}")
sys.exit(1)
# Check if model is pulled
print("\nπ Checking for Qwen2.5:32b model...")
try:
result = subprocess.run(["ollama", "list"], capture_output=True, text=True, check=True)
if "qwen2.5:32b" in result.stdout.lower():
print("β
qwen2.5:32b model found")
else:
print("β οΈ Model not found. Pulling now (this will download ~20GB)...")
print(" This may take 10-30 minutes...")
subprocess.run(["ollama", "pull", "qwen2.5:32b"], check=True)
print("β
Model downloaded")
except Exception as e:
print(f"β Error: {e}")
sys.exit(1)
print("\n" + "=" * 60)
print("Testing inference speed...")
print("=" * 60)
# Medical entity extraction test
test_prompt = """Extract medical entities from this text and return as JSON:
"The patient presented with acute myocardial infarction and was administered aspirin 325mg and atorvastatin 80mg. Blood pressure was 140/90 mmHg."
Return JSON with this structure:
{
"entities": [
{"entity_name": "name", "entity_type": "type", "entity_description": "description"}
],
"relationships": [
{"source_entity_name": "source", "target_entity_name": "target", "relation_type": "type"}
]
}"""
print(f"\nπ Test prompt prepared")
print("\nπ§ͺ Generating response...")
# Run Ollama inference
start_time = time.time()
try:
result = subprocess.run(
["ollama", "run", "qwen2.5:32b", test_prompt],
capture_output=True,
text=True,
check=True
)
elapsed = time.time() - start_time
output = result.stdout.strip()
# Estimate tokens (rough: ~0.75 tokens per word)
words = len(output.split())
estimated_tokens = int(words * 0.75)
tokens_per_sec = estimated_tokens / elapsed if elapsed > 0 else 0
print(f"\nβ
Generation completed!")
print(f"\nπ Performance Metrics (Ollama GGUF Q4):")
print(f" Time: {elapsed:.2f}s")
print(f" Output words: {words}")
print(f" Estimated tokens: {estimated_tokens}")
print(f" Speed: ~{tokens_per_sec:.1f} tokens/sec")
if tokens_per_sec > 60:
print(" β
EXCELLENT! Much faster than bitsandbytes!")
print(" β Use Ollama for the pipeline")
elif tokens_per_sec > 30:
print(" β
Good speed, better than bitsandbytes (8.6 tok/s)")
print(" β Ollama is viable")
elif tokens_per_sec > 15:
print(" β οΈ Moderate speed, slight improvement")
print(" β Consider Qwen3-14B FP16 instead")
else:
print(" β Still slow")
print(" β Fallback to Qwen3-14B FP16 recommended")
print(f"\nπ€ Generated Response:")
print("-" * 60)
print(output[:800])
if len(output) > 800:
print("...(truncated)")
print("-" * 60)
print("\n" + "=" * 60)
print("Comparison:")
print("=" * 60)
print(f"bitsandbytes 4-bit: 8.6 tok/s (previous test)")
print(f"Ollama GGUF Q4: ~{tokens_per_sec:.1f} tok/s (this test)")
print(f"Speedup: {tokens_per_sec / 8.6:.1f}x")
if tokens_per_sec > 30:
print("\nβ
RECOMMENDATION: Use Ollama for the pipeline!")
print(" - Integrate Ollama into run_pipeline.py")
print(" - Expected pipeline time: ~2-4 min/doc (vs 12-15 min with Claude)")
else:
print("\nβ οΈ RECOMMENDATION: Switch to Qwen3-14B FP16")
print(" - Should get 80-100 tok/s")
print(" - Only 2% quality drop vs 32B")
except subprocess.CalledProcessError as e:
print(f"β Error running Ollama: {e}")
print(f" stdout: {e.stdout}")
print(f" stderr: {e.stderr}")
sys.exit(1)
print("\nπ‘ Next steps:")
if tokens_per_sec > 30:
print(" 1. Integrate Ollama into pipeline")
print(" 2. Compare quality vs Claude on 10 test chunks")
print(" 3. If quality is acceptable, use Ollama")
else:
print(" 1. Test Qwen3-14B FP16 (should be much faster)")
print(" 2. If still slow, try Qwen3-8B FP16")