feat: Introduce K/V Context Quantisation (vRAM improvements) #4482
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: test | |
concurrency: | |
# For PRs, later CI runs preempt previous ones. e.g. a force push on a PR | |
# cancels running CI jobs and starts all new ones. | |
# | |
# For non-PR pushes, concurrency.group needs to be unique for every distinct | |
# CI run we want to have happen. Use run_id, which in practice means all | |
# non-PR CI runs will be allowed to run without preempting each other. | |
group: ${{ github.workflow }}-$${{ github.pull_request.number || github.run_id }} | |
cancel-in-progress: true | |
on: | |
pull_request: | |
paths: | |
- '**/*' | |
- '!docs/**' | |
- '!README.md' | |
jobs: | |
changes: | |
runs-on: ubuntu-latest | |
outputs: | |
GENERATE: ${{ steps.changes.outputs.GENERATE }} | |
GENERATE_CUDA: ${{ steps.changes.outputs.GENERATE_CUDA }} | |
GENERATE_ROCM: ${{ steps.changes.outputs.GENERATE_ROCM }} | |
steps: | |
- uses: actions/checkout@v4 | |
with: | |
fetch-depth: 0 | |
- id: changes | |
run: | | |
changed() { | |
git diff-tree -r --no-commit-id --name-only \ | |
$(git merge-base ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }}) \ | |
${{ github.event.pull_request.head.sha }} \ | |
| xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))" | |
} | |
{ | |
echo GENERATE=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**') | |
echo GENERATE_CUDA=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**') | |
echo GENERATE_ROCM=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**') | |
} >>$GITHUB_OUTPUT | |
generate: | |
needs: [changes] | |
if: ${{ needs.changes.outputs.GENERATE == 'True' }} | |
strategy: | |
matrix: | |
os: [ubuntu-latest, macos-latest, windows-2019] | |
arch: [amd64, arm64] | |
exclude: | |
- os: ubuntu-latest | |
arch: arm64 | |
- os: windows-2019 | |
arch: arm64 | |
runs-on: ${{ matrix.os }} | |
env: | |
GOARCH: ${{ matrix.arch }} | |
CGO_ENABLED: '1' | |
steps: | |
- uses: actions/checkout@v4 | |
- uses: actions/setup-go@v5 | |
with: | |
go-version-file: go.mod | |
cache: true | |
- run: go get ./... | |
- run: | | |
$gopath=(get-command go).source | split-path -parent | |
$gccpath=(get-command gcc).source | split-path -parent | |
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" | |
cd $env:GITHUB_WORKSPACE | |
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" | |
$env:PATH="$gopath;$gccpath;$env:PATH" | |
echo $env:PATH | |
go generate -x ./... | |
if: ${{ startsWith(matrix.os, 'windows-') }} | |
name: 'Windows Go Generate' | |
- run: go generate -x ./... | |
if: ${{ ! startsWith(matrix.os, 'windows-') }} | |
name: 'Unix Go Generate' | |
- run: go build . | |
generate-cuda: | |
needs: [changes] | |
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }} | |
strategy: | |
matrix: | |
cuda-version: | |
- '11.8.0' | |
runs-on: linux | |
container: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu20.04 | |
steps: | |
- run: | | |
apt-get update && apt-get install -y git build-essential curl | |
curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \ | |
| tar -zx -C /usr --strip-components 1 | |
env: | |
DEBIAN_FRONTEND: noninteractive | |
- uses: actions/checkout@v4 | |
- uses: actions/setup-go@v4 | |
with: | |
go-version-file: go.mod | |
cache: true | |
- run: go get ./... | |
- run: | | |
git config --global --add safe.directory /__w/ollama/ollama | |
go generate -x ./... | |
env: | |
OLLAMA_SKIP_CPU_GENERATE: '1' | |
generate-rocm: | |
needs: [changes] | |
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }} | |
strategy: | |
matrix: | |
rocm-version: | |
- '6.1.2' | |
runs-on: linux | |
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }} | |
steps: | |
- run: | | |
apt-get update && apt-get install -y git build-essential curl rocm-libs | |
curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \ | |
| tar -zx -C /usr --strip-components 1 | |
env: | |
DEBIAN_FRONTEND: noninteractive | |
- uses: actions/checkout@v4 | |
- uses: actions/setup-go@v4 | |
with: | |
go-version-file: go.mod | |
cache: true | |
- run: go get ./... | |
- run: | | |
git config --global --add safe.directory /__w/ollama/ollama | |
go generate -x ./... | |
env: | |
OLLAMA_SKIP_CPU_GENERATE: '1' | |
# ROCm generation step | |
generate-windows-rocm: | |
needs: [changes] | |
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }} | |
runs-on: windows | |
steps: | |
- uses: actions/checkout@v4 | |
- uses: actions/setup-go@v5 | |
with: | |
go-version-file: go.mod | |
cache: true | |
- name: 'Install ROCm' | |
run: | | |
$ErrorActionPreference = "Stop" | |
write-host "downloading AMD HIP Installer" | |
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" | |
write-host "Installing AMD HIP" | |
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait | |
write-host "Completed AMD HIP" | |
- name: 'Verify ROCm' | |
run: | | |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version | |
- run: go get ./... | |
- run: | | |
$gopath=(get-command go).source | split-path -parent | |
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" | |
cd $env:GITHUB_WORKSPACE | |
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" | |
$env:PATH="$gopath;$env:PATH" | |
$env:OLLAMA_SKIP_CPU_GENERATE="1" | |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) | |
go generate -x ./... | |
name: go generate | |
env: | |
OLLAMA_SKIP_CPU_GENERATE: '1' | |
# CUDA generation step | |
generate-windows-cuda: | |
needs: [changes] | |
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }} | |
runs-on: windows | |
steps: | |
- uses: actions/checkout@v4 | |
- uses: actions/setup-go@v5 | |
with: | |
go-version-file: go.mod | |
cache: true | |
- name: 'Install CUDA' | |
run: | | |
$ErrorActionPreference = "Stop" | |
write-host "downloading CUDA Installer" | |
Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" | |
write-host "Installing CUDA" | |
Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait | |
write-host "Completed CUDA" | |
$cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path) | |
$cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' | |
echo "$cudaPath\bin" >> $env:GITHUB_PATH | |
echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV | |
echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV | |
echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV | |
- name: 'Verify CUDA' | |
run: nvcc -V | |
- run: go get ./... | |
- name: go generate | |
run: | | |
$gopath=(get-command go).source | split-path -parent | |
$cudabin=(get-command nvcc).source | split-path | |
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" | |
cd $env:GITHUB_WORKSPACE | |
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" | |
$env:PATH="$gopath;$cudabin;$env:PATH" | |
$env:OLLAMA_SKIP_CPU_GENERATE="1" | |
go generate -x ./... | |
env: | |
OLLAMA_SKIP_CPU_GENERATE: '1' | |
lint: | |
strategy: | |
matrix: | |
os: [ubuntu-latest, macos-latest, windows-2019] | |
arch: [amd64, arm64] | |
exclude: | |
- os: ubuntu-latest | |
arch: arm64 | |
- os: windows-2019 | |
arch: arm64 | |
- os: macos-latest | |
arch: amd64 | |
runs-on: ${{ matrix.os }} | |
env: | |
GOARCH: ${{ matrix.arch }} | |
CGO_ENABLED: '1' | |
steps: | |
- uses: actions/checkout@v4 | |
with: | |
submodules: recursive | |
- uses: actions/setup-go@v5 | |
with: | |
go-version-file: go.mod | |
cache: false | |
- run: | | |
case ${{ matrix.arch }} in | |
amd64) echo ARCH=x86_64 ;; | |
arm64) echo ARCH=arm64 ;; | |
esac >>$GITHUB_ENV | |
shell: bash | |
- uses: golangci/golangci-lint-action@v6 | |
with: | |
args: --timeout 8m0s -v | |
test: | |
strategy: | |
matrix: | |
os: [ubuntu-latest, macos-latest, windows-2019] | |
arch: [amd64] | |
exclude: | |
- os: ubuntu-latest | |
arch: arm64 | |
- os: windows-2019 | |
arch: arm64 | |
runs-on: ${{ matrix.os }} | |
env: | |
GOARCH: ${{ matrix.arch }} | |
CGO_ENABLED: '1' | |
OLLAMA_CPU_TARGET: 'static' | |
OLLAMA_SKIP_CPU_GENERATE: '1' | |
OLLAMA_SKIP_METAL_GENERATE: '1' | |
steps: | |
- uses: actions/checkout@v4 | |
with: | |
submodules: recursive | |
- uses: actions/setup-go@v5 | |
with: | |
go-version-file: go.mod | |
cache: true | |
- run: | | |
case ${{ matrix.arch }} in | |
amd64) echo ARCH=amd64 ;; | |
arm64) echo ARCH=arm64 ;; | |
esac >>$GITHUB_ENV | |
shell: bash | |
- run: go generate ./... | |
- run: go build | |
- run: go test -v ./... |