Skip to content

Commit dbaaedf

Browse files
KristofferC
authored and committed
compress the version map a bit
We use a compression scheme in which each stdlib, for a given set of dependencies, is associated with a range of Julia versions. Within that Julia range we also store how the version of the stdlib itself has evolved. The compression and uncompression code was written by an LLM following my instructions on how the data should be compressed.
1 parent 047f9f7 commit dbaaedf

File tree

5 files changed

+1882
-15362
lines changed

5 files changed

+1882
-15362
lines changed

ext/HistoricalStdlibGenerator/generate_historical_stdlibs.jl

Lines changed: 236 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -256,48 +256,254 @@ unregistered_stdlibs = filter(all_stdlibs) do (uuid, _)
256256
return !any(haskey(reg.pkgs, uuid) for reg in registries)
257257
end
258258

259-
# Helper function for getting these printed out in a nicely-sorted order
260-
function print_sorted(io::IO, d::Dict; indent::Int=0)
261-
println(io, "Dict{UUID,StdlibInfo}(")
262-
for (uuid, (name, version, deps, weakdeps)) in sort(collect(d), by = kv-> kv[2][1])
263-
println(io,
264-
" "^indent,
265-
repr(uuid), " => StdlibInfo(\n",
266-
" "^(indent + 4), repr(name), ",\n",
267-
" "^(indent + 4), repr(uuid), ",\n",
268-
" "^(indent + 4), repr(version), ",\n",
269-
" "^(indent + 4), repr(sort(deps)), ",\n",
270-
" "^(indent + 4), repr(sort(weakdeps)), ",\n",
271-
" "^indent, "),",
272-
)
259+
"""
    analyze_uuid_usage(stdlibs_by_version)

Count how often every UUID is mentioned across `stdlibs_by_version`, an iterable of
`julia_version => Dict(uuid => info)` pairs.

A UUID is counted once each time it appears as a stdlib key and once each time it
appears in an `info.deps` or `info.weakdeps` list. Names are recorded only for UUIDs
that appear as keys; if a UUID shows up several times, the name seen last wins.

Returns the tuple `(uuid_counts, uuid_to_name)`.
"""
function analyze_uuid_usage(stdlibs_by_version)
    uuid_counts = Dict{UUID, Int}()
    uuid_to_name = Dict{UUID, String}()

    # Increment the tally for a single UUID.
    bump!(id) = (uuid_counts[id] = get(uuid_counts, id, 0) + 1)

    for (_, stdlib_dict) in stdlibs_by_version, (uuid, info) in stdlib_dict
        bump!(uuid)
        uuid_to_name[uuid] = info.name
        foreach(bump!, info.deps)
        foreach(bump!, info.weakdeps)
    end

    return uuid_counts, uuid_to_name
end
281+
282+
"""
    group_by_uuid(stdlibs_by_version)

Regroup `stdlibs_by_version` (an iterable of `julia_version => Dict(uuid => info)`)
by stdlib UUID.

Returns a `Dict` mapping each UUID to a vector of `(julia_version, info)` tuples,
sorted by ascending Julia version.
"""
function group_by_uuid(stdlibs_by_version)
    Entry = Tuple{VersionNumber, StdlibInfo}
    grouped = Dict{UUID, Vector{Entry}}()

    for (julia_version, stdlib_dict) in stdlibs_by_version, (uuid, info) in stdlib_dict
        # Create the per-UUID vector on first sight, then append to it.
        push!(get!(() -> Entry[], grouped, uuid), (julia_version, info))
    end

    foreach(entries -> sort!(entries; by = first), values(grouped))

    return grouped
end
301+
302+
"""
    base_info_equal(a::StdlibInfo, b::StdlibInfo)

Return `true` when `a` and `b` agree on `name`, `uuid`, `deps` and `weakdeps`.
The `version` field is deliberately not compared.
"""
function base_info_equal(a::StdlibInfo, b::StdlibInfo)
    compared_fields = (:name, :uuid, :deps, :weakdeps)
    return all(getproperty(a, f) == getproperty(b, f) for f in compared_fields)
end
309+
310+
"""
    find_base_info_segments(versions_and_infos)

Split `versions_and_infos`, a vector of `(julia_version, info)` tuples, into maximal
runs of consecutive entries whose base info is equal according to `base_info_equal`.

Each run becomes one segment `(base_info, version_ranges)`, where `base_info` is the
info of the run's first entry and `version_ranges` is a vector of
`((first_julia_version, last_julia_version), package_version)` tuples, one per
maximal sub-run of consecutive entries sharing the same `info.version`.

Only adjacency in the input is considered: Julia versions that are missing between
two neighbouring entries do not start a new range.
"""
function find_base_info_segments(versions_and_infos)
    # Index of the last element of the run that starts at `first_idx` and whose
    # following elements all satisfy `matches`.
    function run_end(items, first_idx, matches)
        last_idx = first_idx
        while last_idx < length(items) && matches(items[last_idx + 1])
            last_idx += 1
        end
        return last_idx
    end

    segments = Any[]
    seg_start = 1

    while seg_start <= length(versions_and_infos)
        base = versions_and_infos[seg_start][2]
        seg_stop = run_end(versions_and_infos, seg_start, e -> base_info_equal(e[2], base))
        entries = versions_and_infos[seg_start:seg_stop]

        # Inside the segment, collapse neighbours that share a package version.
        version_ranges = Any[]
        run_start = 1
        while run_start <= length(entries)
            julia_first = entries[run_start][1]
            pkg_version = entries[run_start][2].version
            run_stop = run_end(entries, run_start, e -> e[2].version == pkg_version)
            julia_last = entries[run_stop][1]
            push!(version_ranges, ((julia_first, julia_last), pkg_version))
            run_start = run_stop + 1
        end

        push!(segments, (base, version_ranges))
        seg_start = seg_stop + 1
    end

    return segments
end
359+
360+
"""
    analyze_stdlib_patterns(uuid_to_versions)

Apply `find_base_info_segments` to the version history of every stdlib.

Returns a `Dict{UUID, Vector}` mapping each UUID in `uuid_to_versions` to its segments.
"""
function analyze_stdlib_patterns(uuid_to_versions)
    return Dict{UUID, Vector}(
        uuid => find_base_info_segments(history) for (uuid, history) in uuid_to_versions
    )
end
371+
372+
"""
    format_uuid(uuid, uuid_constants)

Return the source text used to refer to `uuid` in the generated file: the constant
name stored in `uuid_constants` when there is one, otherwise a `UUID("...")` literal.
"""
function format_uuid(uuid, uuid_constants)
    literal = string("UUID(\"", uuid, "\")")
    return get(uuid_constants, uuid, literal)
end
376+
377+
"""
    format_base_info(info, uuid_constants)

Render the version-independent part of `info` (`name`, `uuid`, `deps`, `weakdeps`)
as the source text of a parenthesised named tuple, one field per line. UUIDs are
rendered through `format_uuid`, so known constants are referenced by name.
"""
function format_base_info(info, uuid_constants)
    # Source text for a `UUID[...]` literal listing `ids`.
    function uuid_list(ids)
        isempty(ids) && return "UUID[]"
        return "UUID[" * join([format_uuid(d, uuid_constants) for d in ids], ", ") * "]"
    end

    deps_str = uuid_list(info.deps)
    weakdeps_str = uuid_list(info.weakdeps)

    field_lines = [
        "name = \"$(info.name)\",",
        "uuid = $(format_uuid(info.uuid, uuid_constants)),",
        "deps = $(deps_str),",
        "weakdeps = $(weakdeps_str),",
    ]
    return "(\n" * join(field_lines, "\n") * "\n)"
end
398+
399+
# Turn `versions_dict` into the shape the compression helpers expect: a vector of
# `julia_version => Dict(uuid => StdlibInfo)` pairs in ascending Julia-version order.
# Each value in `versions_dict[v]` is a positional `(name, version, deps, weakdeps)`
# tuple; the UUID key is threaded into the `StdlibInfo` constructor.
stdlibs_by_version = map(sort(collect(keys(versions_dict)))) do julia_version
    infos = Dict{UUID, StdlibInfo}(
        uuid => StdlibInfo(name, uuid, version, deps, weakdeps)
        for (uuid, (name, version, deps, weakdeps)) in versions_dict[julia_version]
    )
    return julia_version => infos
end
408+
409+
# Gather usage statistics that drive the compression.
@info("Analyzing version map for compression...")
uuid_counts, uuid_to_name = analyze_uuid_usage(stdlibs_by_version)

# UUIDs mentioned at least `const_threshold` times get a named constant in the
# generated file. UUIDs without a known name (seen only as dependencies) are skipped.
const_threshold = 5
uuid_constants = Dict{UUID, String}()

for (uuid, count) in uuid_counts
    (count >= const_threshold && haskey(uuid_to_name, uuid)) || continue
    # Replace anything that is not alphanumeric or `_` with `_`.
    uuid_constants[uuid] = replace("$(uuid_to_name[uuid])_uuid", r"[^a-zA-Z0-9_]" => "_")
end

# Per-stdlib history, then the segment/range decomposition of that history.
uuid_to_versions = group_by_uuid(stdlibs_by_version)
uuid_to_segments = analyze_stdlib_patterns(uuid_to_versions)
428+
429+
# Output compressed version map.
# The target is version_map_compressed.jl because that is the file
# src/HistoricalStdlibVersions.jl includes; writing to version_map.jl would leave the
# regenerated data unused.
output_fname = joinpath(dirname(dirname(@__DIR__)), "src", "version_map_compressed.jl")
@info("Outputting compressed version map to $(output_fname)")
open(output_fname, "w") do io
    # Source text for a `UUID[...]` literal listing `ids`.
    uuid_list(ids) = "UUID[" * join([format_uuid(d, uuid_constants) for d in ids], ", ") * "]"
    # Source text for an optional version number.
    version_literal(ver) = isnothing(ver) ? "nothing" : "v\"$(ver)\""

    println(io, "## This file autogenerated by ext/HistoricalStdlibGenerator/generate_historical_stdlibs.jl")
    println(io)
    println(io, "# Julia standard libraries with segment-based compression:")
    println(io, "# - Each stdlib split into segments where base info (name, uuid, deps, weakdeps) is constant")
    println(io, "# - Within each segment, only package version numbers stored per Julia version range")
    println(io)

    # Write UUID constants, sorted by constant name for a stable diff.
    if !isempty(uuid_constants)
        println(io, "# UUID constants")
        for (uuid, const_name) in sort(collect(uuid_constants), by=x->x[2])
            println(io, "const $(const_name) = UUID(\"$(uuid)\")")
        end
        println(io)
    end

    # Write stdlib info with segments, sorted by stdlib name.
    println(io, "# Format: UUID => [(base_info, [(julia_version_range, package_version)]), ...]")
    println(io, "const STDLIB_SEGMENTS = Dict{UUID, Vector{Tuple{NamedTuple, Vector{Pair{Tuple{VersionNumber,VersionNumber}, Union{Nothing,VersionNumber}}}}}}(")

    sorted_uuids = sort(collect(keys(uuid_to_segments)), by=u->uuid_to_name[u])

    for (idx, uuid) in enumerate(sorted_uuids)
        segments = uuid_to_segments[uuid]
        uuid_str = format_uuid(uuid, uuid_constants)

        println(io, " $(uuid_str) => [")

        for (seg_idx, (base_info, version_ranges)) in enumerate(segments)
            print(io, " (")
            println(io, format_base_info(base_info, uuid_constants), ",")
            println(io, " [")

            for (range_idx, ((start_v, end_v), ver)) in enumerate(version_ranges)
                ver_str = version_literal(ver)
                comma = range_idx < length(version_ranges) ? "," : ""
                # A single-version range is written with identical endpoints, so no
                # special case is needed when start_v == end_v.
                println(io, " (v\"$(start_v)\", v\"$(end_v)\") => $(ver_str)$(comma)")
            end

            print(io, " ])")
            println(io, seg_idx < length(segments) ? "," : "")
        end

        print(io, " ]")
        println(io, idx < length(sorted_uuids) ? "," : "")
    end

    println(io, ")")
    println(io)

    # Write UNREGISTERED_STDLIBS
    println(io, "# Next, we also embed a list of stdlibs that must _always_ be treated as stdlibs,")
    println(io, "# because they cannot be resolved in the registry; they have only ever existed within")
    println(io, "# the Julia stdlib source tree, and because of that, trying to resolve them will fail.")
    println(io, "const UNREGISTERED_STDLIBS = Dict{UUID,StdlibInfo}(")

    # Values of `unregistered_stdlibs` are positional (name, version, deps, weakdeps)
    # tuples, not StdlibInfo objects, so they are sorted and unpacked by position.
    sorted_unregistered = sort(collect(unregistered_stdlibs), by = kv -> kv[2][1])
    for (idx, (uuid, (name, version, deps, weakdeps))) in enumerate(sorted_unregistered)
        uuid_str = format_uuid(uuid, uuid_constants)

        println(io, " $(uuid_str) => StdlibInfo(")
        println(io, " \"$(name)\",")
        println(io, " $(uuid_str),")
        println(io, " $(version_literal(version)),")
        println(io, " $(uuid_list(deps)),")
        println(io, " $(uuid_list(weakdeps)),")
        println(io, " )", idx < length(sorted_unregistered) ? "," : "")
    end
    println(io, ")")
end

src/HistoricalStdlibVersions.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ module HistoricalStdlibVersions
77
using Pkg
88
using PrecompileTools: @setup_workload, @compile_workload
99
include("StdlibInfo.jl")
10-
include("version_map.jl")
10+
include("version_map_compressed.jl")
11+
include("uncompress.jl")
1112

1213
let
1314
max_hsg_version = maximum(first.(STDLIBS_BY_VERSION))

src/uncompress.jl

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
2+
"""
    _build_stdlibs_by_uuid()

Expand `STDLIB_SEGMENTS` into a `Dict` mapping each stdlib UUID to a vector of
`(first_julia_version, last_julia_version) => StdlibInfo` pairs.

Every version range of every segment yields one `StdlibInfo`, combining the segment's
base info (`name`, `uuid`, `deps`, `weakdeps`) with that range's package version.
Pairs keep the order in which segments and ranges are stored.
"""
function _build_stdlibs_by_uuid()
    RangedInfo = Pair{Tuple{VersionNumber,VersionNumber}, StdlibInfo}
    by_uuid = Dict{UUID, Vector{RangedInfo}}()

    for (uuid, segments) in STDLIB_SEGMENTS
        entries = RangedInfo[]
        for (base, version_ranges) in segments, (julia_range, pkg_version) in version_ranges
            info = StdlibInfo(base.name, base.uuid, pkg_version, base.deps, base.weakdeps)
            push!(entries, julia_range => info)
        end
        by_uuid[uuid] = entries
    end

    return by_uuid
end

const STDLIBS_BY_UUID = _build_stdlibs_by_uuid()
25+
26+
"""
    _build_stdlibs_by_version()

Convert `STDLIBS_BY_UUID` into the version-indexed form: a vector of
`julia_version => Dict(uuid => StdlibInfo)` pairs sorted by Julia version.

The Julia versions listed are exactly the endpoints of the stored ranges. For each
such version, a stdlib is included with the info of the first of its ranges that
contains the version. Versions for which no stdlib matches are omitted.
"""
function _build_stdlibs_by_version()
    # Every range endpoint is a Julia version that gets its own entry.
    boundaries = Set{VersionNumber}()
    for ranges in values(STDLIBS_BY_UUID), (bounds, _) in ranges
        union!(boundaries, bounds)
    end

    by_version = Pair{VersionNumber, Dict{UUID,StdlibInfo}}[]
    for julia_version in sort(collect(boundaries))
        active = Dict{UUID,StdlibInfo}()
        for (uuid, ranges) in STDLIBS_BY_UUID
            # First range whose inclusive bounds contain this Julia version, if any.
            hit = findfirst(r -> first(r)[1] <= julia_version <= first(r)[2], ranges)
            hit === nothing || (active[uuid] = last(ranges[hit]))
        end
        isempty(active) || push!(by_version, julia_version => active)
    end

    return by_version
end

const STDLIBS_BY_VERSION = _build_stdlibs_by_version()

0 commit comments

Comments
 (0)