Skip to content

Commit b2baa38

Browse files
authored Feb 6, 2025
Report LLM token usage (#991)
* report token usage at end of codemodder run
* move log
1 parent d615dd7 commit b2baa38

File tree

5 files changed

+78
-28
lines changed

5 files changed

+78
-28
lines changed
 

‎src/codemodder/codemodder.py

+22-15
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
from codemodder.codetf import CodeTF
1515
from codemodder.context import CodemodExecutionContext
1616
from codemodder.dependency import Dependency
17-
from codemodder.llm import MisconfiguredAIClient
17+
from codemodder.llm import MisconfiguredAIClient, TokenUsage, log_token_usage
1818
from codemodder.logging import configure_logger, log_list, log_section, logger
1919
from codemodder.project_analysis.file_parsers.package_store import PackageStore
2020
from codemodder.project_analysis.python_repo_manager import PythonRepoManager
@@ -46,7 +46,7 @@ def find_semgrep_results(
4646
return run_semgrep(context, yaml_files, files_to_analyze)
4747

4848

49-
def log_report(context, output, elapsed_ms, files_to_analyze):
49+
def log_report(context, output, elapsed_ms, files_to_analyze, token_usage):
5050
log_section("report")
5151
logger.info("scanned: %s files", len(files_to_analyze))
5252
all_failures = context.get_failed_files()
@@ -62,6 +62,7 @@ def log_report(context, output, elapsed_ms, files_to_analyze):
6262
len(set(all_changes)),
6363
)
6464
logger.info("report file: %s", output)
65+
log_token_usage("All", token_usage)
6566
logger.info("total elapsed: %s ms", elapsed_ms)
6667
logger.info(" semgrep: %s ms", context.timer.get_time_ms("semgrep"))
6768
logger.info(" parse: %s ms", context.timer.get_time_ms("parse"))
@@ -72,24 +73,30 @@ def log_report(context, output, elapsed_ms, files_to_analyze):
7273
def apply_codemods(
7374
context: CodemodExecutionContext,
7475
codemods_to_run: Sequence[BaseCodemod],
75-
):
76+
) -> TokenUsage:
7677
log_section("scanning")
78+
token_usage = TokenUsage()
7779

7880
if not context.files_to_analyze:
7981
logger.info("no files to scan")
80-
return
82+
return token_usage
8183

8284
if not codemods_to_run:
8385
logger.info("no codemods to run")
84-
return
86+
return token_usage
8587

8688
# run codemods one at a time making sure to respect the given sequence
8789
for codemod in codemods_to_run:
8890
# NOTE: this may be used as a progress indicator by upstream tools
8991
logger.info("running codemod %s", codemod.id)
90-
codemod.apply(context)
92+
codemod_token_usage = codemod.apply(context)
93+
if codemod_token_usage:
94+
log_token_usage(f"Codemod {codemod.id}", codemod_token_usage)
95+
token_usage += codemod_token_usage
96+
9197
record_dependency_update(context.process_dependencies(codemod.id))
9298
context.log_changes(codemod.id)
99+
return token_usage
93100

94101

95102
def record_dependency_update(dependency_results: dict[Dependency, PackageStore | None]):
@@ -128,7 +135,7 @@ def run(
128135
codemod_registry: registry.CodemodRegistry | None = None,
129136
sast_only: bool = False,
130137
ai_client: bool = True,
131-
) -> tuple[CodeTF | None, int]:
138+
) -> tuple[CodeTF | None, int, TokenUsage]:
132139
start = datetime.datetime.now()
133140

134141
codemod_registry = codemod_registry or registry.load_registered_codemods()
@@ -139,6 +146,7 @@ def run(
139146
codemod_exclude = codemod_exclude or []
140147

141148
provider_registry = providers.load_providers()
149+
token_usage = TokenUsage()
142150

143151
log_section("startup")
144152
logger.info("codemodder: python/%s", __version__)
@@ -148,7 +156,7 @@ def run(
148156
logger.error(
149157
f"FileNotFoundError: [Errno 2] No such file or directory: '{file_name}'"
150158
)
151-
return None, 1
159+
return None, 1, token_usage
152160

153161
repo_manager = PythonRepoManager(Path(directory))
154162

@@ -168,7 +176,8 @@ def run(
168176
)
169177
except MisconfiguredAIClient as e:
170178
logger.error(e)
171-
return None, 3 # Codemodder instructions conflicted (according to spec)
179+
# Codemodder instructions conflicted (according to spec)
180+
return None, 3, token_usage
172181

173182
context.repo_manager.parse_project()
174183

@@ -194,10 +203,7 @@ def run(
194203
context.find_and_fix_paths,
195204
)
196205

197-
apply_codemods(
198-
context,
199-
codemods_to_run,
200-
)
206+
token_usage = apply_codemods(context, codemods_to_run)
201207

202208
elapsed = datetime.datetime.now() - start
203209
elapsed_ms = int(elapsed.total_seconds() * 1000)
@@ -217,8 +223,9 @@ def run(
217223
output,
218224
elapsed_ms,
219225
[] if not codemods_to_run else context.files_to_analyze,
226+
token_usage,
220227
)
221-
return codetf, 0
228+
return codetf, 0, token_usage
222229

223230

224231
def _run_cli(original_args) -> int:
@@ -258,7 +265,7 @@ def _run_cli(original_args) -> int:
258265
logger.info("command: %s %s", Path(sys.argv[0]).name, " ".join(original_args))
259266
configure_logger(argv.verbose, argv.log_format, argv.project_name)
260267

261-
_, status = run(
268+
_, status, _ = run(
262269
argv.directory,
263270
argv.dry_run,
264271
argv.output,

‎src/codemodder/codemods/base_codemod.py

+11-9
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
from codemodder.codetf import DetectionTool, Reference
1818
from codemodder.context import CodemodExecutionContext
1919
from codemodder.file_context import FileContext
20+
from codemodder.llm import TokenUsage
2021
from codemodder.logging import logger
2122
from codemodder.result import ResultSet
2223

@@ -188,15 +189,15 @@ def _apply(
188189
self,
189190
context: CodemodExecutionContext,
190191
rules: list[str],
191-
) -> None:
192+
) -> None | TokenUsage:
192193
if self.provider and (
193194
not (provider := context.providers.get_provider(self.provider))
194195
or not provider.is_available
195196
):
196197
logger.warning(
197198
"provider %s is not available, skipping codemod", self.provider
198199
)
199-
return
200+
return None
200201

201202
if isinstance(self.detector, SemgrepRuleDetector):
202203
if (
@@ -208,7 +209,7 @@ def _apply(
208209
"no results from semgrep for %s, skipping analysis",
209210
self.id,
210211
)
211-
return
212+
return None
212213

213214
results: ResultSet | None = (
214215
# It seems like semgrep doesn't like our fully-specified id format so pass in short name instead.
@@ -219,11 +220,11 @@ def _apply(
219220

220221
if results is not None and not results:
221222
logger.debug("No results for %s", self.id)
222-
return
223+
return None
223224

224225
if not (files_to_analyze := self.get_files_to_analyze(context, results)):
225226
logger.debug("No files matched for %s", self.id)
226-
return
227+
return None
227228

228229
process_file = functools.partial(
229230
self._process_file, context=context, results=results, rules=rules
@@ -240,8 +241,9 @@ def _apply(
240241
executor.shutdown(wait=True)
241242

242243
context.process_results(self.id, contexts)
244+
return None
243245

244-
def apply(self, context: CodemodExecutionContext) -> None:
246+
def apply(self, context: CodemodExecutionContext) -> None | TokenUsage:
245247
"""
246248
Apply the codemod with the given codemod execution context
247249
@@ -257,7 +259,7 @@ def apply(self, context: CodemodExecutionContext) -> None:
257259
258260
:param context: The codemod execution context
259261
"""
260-
self._apply(context, [self._internal_name])
262+
return self._apply(context, [self._internal_name])
261263

262264
def _process_file(
263265
self,
@@ -355,8 +357,8 @@ def __init__(
355357
if requested_rules:
356358
self.requested_rules.extend(requested_rules)
357359

358-
def apply(self, context: CodemodExecutionContext) -> None:
359-
self._apply(context, self.requested_rules)
360+
def apply(self, context: CodemodExecutionContext) -> None | TokenUsage:
361+
return self._apply(context, self.requested_rules)
360362

361363
def get_files_to_analyze(
362364
self,

‎src/codemodder/llm.py

+29
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,11 @@
11
from __future__ import annotations
22

33
import os
4+
from dataclasses import dataclass
45
from typing import TYPE_CHECKING
56

7+
from typing_extensions import Self
8+
69
try:
710
from openai import AzureOpenAI, OpenAI
811
except ImportError:
@@ -28,6 +31,8 @@
2831
"setup_openai_llm_client",
2932
"setup_azure_llama_llm_client",
3033
"MisconfiguredAIClient",
34+
"TokenUsage",
35+
"log_token_usage",
3136
]
3237

3338
models = ["gpt-4-turbo-2024-04-09", "gpt-4o-2024-05-13", "gpt-35-turbo-0125"]
@@ -115,3 +120,27 @@ def setup_azure_llama_llm_client() -> ChatCompletionsClient | None:
115120

116121
class MisconfiguredAIClient(ValueError):
117122
pass
123+
124+
125+
@dataclass
126+
class TokenUsage:
127+
completion_tokens: int = 0
128+
prompt_tokens: int = 0
129+
130+
def __iadd__(self, other: Self) -> Self:
131+
self.completion_tokens += other.completion_tokens
132+
self.prompt_tokens += other.prompt_tokens
133+
return self
134+
135+
@property
136+
def total(self):
137+
return self.completion_tokens + self.prompt_tokens
138+
139+
140+
def log_token_usage(header: str, token_usage: TokenUsage) -> None:
    """
    Log the completion and prompt token counts of a tally at INFO level.

    :param header: label placed in front of "token usage" in the message
    :param token_usage: tally whose two counters are reported
    """
    # Arguments are passed separately so the logger formats the message lazily.
    logger.info(
        "%s token usage\n\tcompletion_tokens = %s\n\tprompt_tokens = %s",
        header,
        token_usage.completion_tokens,
        token_usage.prompt_tokens,
    )

‎tests/test_codemodder.py

+7-3
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
from codemodder import run
88
from codemodder.codemodder import _run_cli, find_semgrep_results
99
from codemodder.diff import create_diff_from_tree
10+
from codemodder.llm import TokenUsage
1011
from codemodder.registry import load_registered_codemods
1112
from codemodder.result import ResultSet
1213
from codemodder.semgrep import run as semgrep_run
@@ -30,7 +31,9 @@ def disable_codemod_apply(mocker, request):
3031
"test_run_codemod_name_or_id",
3132
):
3233
return
33-
mocker.patch("codemodder.codemods.base_codemod.BaseCodemod.apply")
34+
mocker.patch(
35+
"codemodder.codemods.base_codemod.BaseCodemod.apply", return_value=TokenUsage()
36+
)
3437

3538

3639
@pytest.fixture(scope="function")
@@ -395,7 +398,8 @@ class TestRun:
395398
def test_run_basic_call(self, mock_parse, dir_structure):
396399
code_dir, codetf = dir_structure
397400

398-
codetf_output, status = run(code_dir, dry_run=True)
401+
codetf_output, status, token_usage = run(code_dir, dry_run=True)
402+
assert token_usage.total == 0
399403
assert status == 0
400404
assert codetf_output
401405
assert codetf_output.run.directory == str(code_dir)
@@ -406,7 +410,7 @@ def test_run_basic_call(self, mock_parse, dir_structure):
406410
def test_run_with_output(self, mock_parse, dir_structure):
407411
code_dir, codetf = dir_structure
408412

409-
codetf_output, status = run(
413+
codetf_output, status, _ = run(
410414
code_dir,
411415
output=codetf,
412416
dry_run=True,

‎tests/test_llm.py

+9-1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
import pytest
44

5-
from codemodder.llm import MODELS, models
5+
from codemodder.llm import MODELS, TokenUsage, models
66

77

88
class TestModels:
@@ -20,3 +20,11 @@ def test_model_get_name_from_env(self, mocker, model):
2020
},
2121
)
2222
assert getattr(MODELS, attr_name) == name
23+
24+
25+
def test_token_usage():
    """In-place addition accumulates both counters and ``total`` sums them."""
    token_usage = TokenUsage()
    original = token_usage
    token_usage += TokenUsage(10, 5)
    # ``+=`` must mutate the existing tally rather than rebind to a new one.
    assert token_usage is original
    assert token_usage.completion_tokens == 10
    assert token_usage.prompt_tokens == 5
    assert token_usage.total == 15

0 commit comments

Comments
 (0)
Please sign in to comment.