Skip to content

Commit 34e555e

Browse files
KristofferCKristofferC
authored andcommitted
compress the version map a bit
We use a compression scheme in which each stdlib, together with a given set of dependencies, is associated with a range of Julia versions. Within that Julia range we also store how the version of the stdlib itself has evolved. The compression and decompression code was written by an LLM, following my instructions on how the data should be compressed.
1 parent 047f9f7 commit 34e555e

File tree

7 files changed

+2140
-15375
lines changed

7 files changed

+2140
-15375
lines changed

ext/HistoricalStdlibGenerator/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,6 @@
22
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
33
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
44
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
5+
Runic = "e39a0499-db5f-4c45-8b15-811f5c0f86e3"
56
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
67
Scratch = "6c6a2e73-6563-6170-7368-637461726353"
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
# Compression functions for stdlib version data
2+
3+
# Analyze UUID usage for constants
4+
"""
    analyze_uuid_usage(stdlibs_by_version)

Tally how often every UUID occurs in `stdlibs_by_version` and record stdlib names.

`stdlibs_by_version` is iterated as `(julia_version, stdlib_dict)` pairs; every
`stdlib_dict` is iterated as `(uuid, info)` pairs, where `info` provides `name`,
`deps` and `weakdeps`. A UUID is counted once each time it appears as a key of a
`stdlib_dict` and once each time it is listed in some `deps` or `weakdeps`.

Returns the tuple `(uuid_counts, uuid_to_name)`. Only UUIDs that occur as keys
receive a name; if a UUID is seen more than once, the name of the entry that was
iterated last is kept.
"""
function analyze_uuid_usage(stdlibs_by_version)
    uuid_counts = Dict{UUID, Int}()
    uuid_to_name = Dict{UUID, String}()

    # Single place that increments the usage counter of one UUID.
    bump!(id) = (uuid_counts[id] = get(uuid_counts, id, 0) + 1)

    for (_, stdlib_dict) in stdlibs_by_version, (uuid, info) in stdlib_dict
        bump!(uuid)
        uuid_to_name[uuid] = info.name
        foreach(bump!, info.deps)
        foreach(bump!, info.weakdeps)
    end

    return uuid_counts, uuid_to_name
end
25+
26+
# Group by UUID
27+
"""
    group_by_uuid(stdlibs_by_version)

Regroup `stdlibs_by_version` so that it is keyed by stdlib UUID instead of by Julia
version.

Returns a `Dict` mapping each UUID to a vector of `(julia_version, info)` tuples,
sorted by ascending `julia_version`.
"""
function group_by_uuid(stdlibs_by_version)
    uuid_to_versions = Dict{UUID, Vector{Tuple{VersionNumber, StdlibInfo}}}()

    for (version, stdlib_dict) in stdlibs_by_version
        for (uuid, info) in stdlib_dict
            # Create the history vector on first sight of this UUID.
            history = get!(uuid_to_versions, uuid) do
                Tuple{VersionNumber, StdlibInfo}[]
            end
            push!(history, (version, info))
        end
    end

    # The input order is not relied upon: order each history by Julia version.
    foreach(history -> sort!(history, by = first), values(uuid_to_versions))

    return uuid_to_versions
end
45+
46+
# Compare base info (everything except version)
47+
"""
    base_info_equal(a::StdlibInfo, b::StdlibInfo) -> Bool

Return `true` when `a` and `b` agree on everything except `version`, i.e. on `name`,
`uuid`, `deps` and `weakdeps`.

`deps` and `weakdeps` are compared after sorting, so two entries that list the same
dependencies in a different order are still considered equal. Comparing the raw
vectors would make the result depend on listing order and start a new segment even
though nothing about the stdlib changed.
"""
function base_info_equal(a::StdlibInfo, b::StdlibInfo)
    return a.name == b.name &&
        a.uuid == b.uuid &&
        sort(a.deps) == sort(b.deps) &&
        sort(a.weakdeps) == sort(b.weakdeps)
end
53+
54+
# Find segments where base info is constant
55+
"""
    find_base_info_segments(versions_and_infos)

Split the history of one stdlib into segments of consecutive entries that share the
same base info (as decided by `base_info_equal`), and compress each segment into
Julia version ranges.

`versions_and_infos` is an indexable collection of `(julia_version, info)` entries;
entries are processed in the order given. Returns a vector of
`(base_info, version_ranges)` tuples, where `base_info` is the info of the first
entry of the segment and `version_ranges` is a vector of
`((first_julia_version, last_julia_version), package_version)` tuples covering
consecutive entries whose `info.version` is equal.
"""
function find_base_info_segments(versions_and_infos)
    segments = Any[]
    total = length(versions_and_infos)
    seg_first = 1

    while seg_first <= total
        reference_info = versions_and_infos[seg_first][2]

        # Extend the segment while the following entry has the same base info.
        seg_last = seg_first
        while seg_last < total &&
                base_info_equal(versions_and_infos[seg_last + 1][2], reference_info)
            seg_last += 1
        end
        chunk = versions_and_infos[seg_first:seg_last]

        # Inside the segment, collapse runs of identical package versions.
        version_ranges = Any[]
        run_first = 1
        while run_first <= length(chunk)
            pkg_version = chunk[run_first][2].version
            run_last = run_first
            while run_last < length(chunk) &&
                    chunk[run_last + 1][2].version == pkg_version
                run_last += 1
            end
            julia_range = (chunk[run_first][1], chunk[run_last][1])
            push!(version_ranges, (julia_range, pkg_version))
            run_first = run_last + 1
        end

        push!(segments, (reference_info, version_ranges))
        seg_first = seg_last + 1
    end

    return segments
end
103+
104+
# Analyze stdlib patterns with segmentation
105+
"""
    analyze_stdlib_patterns(uuid_to_versions)

Apply `find_base_info_segments` to the history of every UUID in `uuid_to_versions`.

Returns a `Dict{UUID, Vector}` mapping each UUID to its list of segments.
"""
function analyze_stdlib_patterns(uuid_to_versions)
    return Dict{UUID, Vector}(
        uuid => find_base_info_segments(history)
        for (uuid, history) in uuid_to_versions
    )
end
115+
116+
# Format UUID reference
117+
"""
    format_uuid(uuid, uuid_constants)

Return the source text used to refer to `uuid` in the generated file: the constant
name registered for it in `uuid_constants`, or a literal `UUID("...")` constructor
call when no constant is registered.
"""
function format_uuid(uuid, uuid_constants)
    if haskey(uuid_constants, uuid)
        return uuid_constants[uuid]
    end
    return string("UUID(\"", uuid, "\")")
end
120+
121+
# Format base info tuple
122+
"""
    format_base_info(info, uuid_constants)

Render the version-independent fields of `info` (`name`, `uuid`, `deps`, `weakdeps`)
as the source text of a named-tuple literal, one field per line. UUIDs are written
through `format_uuid`, so registered constants are used where available.
"""
function format_base_info(info, uuid_constants)
    # `join` over an empty collection yields "", so an empty list renders as `UUID[]`.
    uuid_list(ids) =
        "UUID[" * join((format_uuid(id, uuid_constants) for id in ids), ", ") * "]"

    lines = [
        "(",
        "name = \"$(info.name)\",",
        "uuid = $(format_uuid(info.uuid, uuid_constants)),",
        "deps = $(uuid_list(info.deps)),",
        "weakdeps = $(uuid_list(info.weakdeps)),",
        ")",
    ]
    return join(lines, "\n")
end
142+
143+
# Write compressed version map to file
144+
"""
    write_compressed_version_map(output_fname, stdlibs_by_version, unregistered_stdlibs)

Write the segment-compressed stdlib version map as Julia source to `output_fname`.

The generated file contains, in order:

1. a `const <name>_uuid = UUID("...")` definition for every stdlib UUID that is used
   at least `const_threshold` (5) times, as counted by `analyze_uuid_usage`;
2. `STDLIB_SEGMENTS`, mapping each stdlib UUID to its list of
   `(base_info, [julia_version_range => package_version, ...])` segments, with stdlibs
   ordered by name;
3. `UNREGISTERED_STDLIBS`, built from `unregistered_stdlibs` and ordered by name.

# Arguments
- `output_fname`: path of the file to create or overwrite.
- `stdlibs_by_version`: iterable of `julia_version => Dict(uuid => info)` pairs.
- `unregistered_stdlibs`: collection of `uuid => info` pairs.
"""
function write_compressed_version_map(output_fname, stdlibs_by_version, unregistered_stdlibs)
    @info("Analyzing version map for compression...")
    uuid_counts, uuid_to_name = analyze_uuid_usage(stdlibs_by_version)

    # Frequently used UUIDs of named stdlibs get a constant in the generated file.
    const_threshold = 5
    uuid_constants = Dict{UUID, String}()
    for (uuid, count) in uuid_counts
        (count >= const_threshold && haskey(uuid_to_name, uuid)) || continue
        # Replace anything that cannot be part of a plain ASCII identifier.
        uuid_constants[uuid] =
            replace("$(uuid_to_name[uuid])_uuid", r"[^a-zA-Z0-9_]" => "_")
    end

    uuid_to_segments = analyze_stdlib_patterns(group_by_uuid(stdlibs_by_version))

    # Small formatting helpers shared by both generated tables.
    separator(position, total) = position < total ? "," : ""
    version_literal(ver) = isnothing(ver) ? "nothing" : "v\"$(ver)\""
    uuid_list(ids) =
        "UUID[" * join((format_uuid(id, uuid_constants) for id in ids), ", ") * "]"

    @info("Outputting compressed version map to $(output_fname)")
    open(output_fname, "w") do io
        # File header.
        println(io, "## This file autogenerated by ext/HistoricalStdlibGenerator/generate_historical_stdlibs.jl")
        println(io)
        println(io, "# Julia standard libraries with segment-based compression:")
        println(io, "# - Each stdlib split into segments where base info (name, uuid, deps, weakdeps) is constant")
        println(io, "# - Within each segment, only package version numbers stored per Julia version range")
        println(io)

        # UUID constants, ordered by constant name.
        if !isempty(uuid_constants)
            println(io, "# UUID constants")
            for (uuid, const_name) in sort(collect(uuid_constants), by = last)
                println(io, "const $(const_name) = UUID(\"$(uuid)\")")
            end
            println(io)
        end

        # STDLIB_SEGMENTS, ordered by stdlib name.
        println(io, "# Format: UUID => [(base_info, [(julia_version_range, package_version)]), ...]")
        println(io, "const STDLIB_SEGMENTS = Dict{UUID, Vector{Tuple{NamedTuple, Vector{Pair{Tuple{VersionNumber,VersionNumber}, Union{Nothing,VersionNumber}}}}}}(")

        sorted_uuids = sort(collect(keys(uuid_to_segments)), by = u -> uuid_to_name[u])
        uuid_total = length(sorted_uuids)

        for (idx, uuid) in enumerate(sorted_uuids)
            segments = uuid_to_segments[uuid]
            segment_total = length(segments)

            println(io, " $(format_uuid(uuid, uuid_constants)) => [")

            for (seg_idx, (base_info, version_ranges)) in enumerate(segments)
                range_total = length(version_ranges)

                println(io, " (", format_base_info(base_info, uuid_constants), ",")
                println(io, " [")

                for (range_idx, ((start_v, end_v), ver)) in enumerate(version_ranges)
                    # A single-version range is written with equal endpoints.
                    println(io, " (v\"$(start_v)\", v\"$(end_v)\") => $(version_literal(ver))$(separator(range_idx, range_total))")
                end

                println(io, " ])", separator(seg_idx, segment_total))
            end

            println(io, " ]", separator(idx, uuid_total))
        end

        println(io, ")")
        println(io)

        # UNREGISTERED_STDLIBS, ordered by stdlib name.
        println(io, "# Next, we also embed a list of stdlibs that must _always_ be treated as stdlibs,")
        println(io, "# because they cannot be resolved in the registry; they have only ever existed within")
        println(io, "# the Julia stdlib source tree, and because of that, trying to resolve them will fail.")
        println(io, "const UNREGISTERED_STDLIBS = Dict{UUID,StdlibInfo}(")

        unregistered = sort(collect(unregistered_stdlibs), by = x -> x[2].name)
        for (idx, (uuid, info)) in enumerate(unregistered)
            uuid_str = format_uuid(uuid, uuid_constants)

            println(io, " $(uuid_str) => StdlibInfo(")
            println(io, " \"$(info.name)\",")
            println(io, " $(uuid_str),")
            println(io, " $(version_literal(info.version)),")
            println(io, " $(uuid_list(info.deps)),")
            println(io, " $(uuid_list(info.weakdeps)),")
            println(io, " )", separator(idx, length(unregistered_stdlibs)))
        end
        println(io, ")")
    end
end

ext/HistoricalStdlibGenerator/generate_historical_stdlibs.jl

Lines changed: 22 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#!/usr/bin/env julia
22

3-
using Downloads, JSON3, Base.BinaryPlatforms, Scratch, SHA, Pkg, TOML
3+
using Downloads, JSON3, Base.BinaryPlatforms, Scratch, SHA, Pkg, TOML, Runic
44
include("../../src/StdlibInfo.jl")
5+
include("compress.jl")
56

67
# Work around issues where we attempt to `eval()` code from Julia versions
78
# that have `Pkg.Types.StdlibInfo` (and embed that exact symbol path)
@@ -256,48 +257,26 @@ unregistered_stdlibs = filter(all_stdlibs) do (uuid, _)
256257
return !any(haskey(reg.pkgs, uuid) for reg in registries)
257258
end
258259

259-
# Helper function for getting these printed out in a nicely-sorted order
260-
"""
    print_sorted(io::IO, d::Dict; indent::Int=0)

Print `d` to `io` as the source text of a `Dict{UUID,StdlibInfo}(...)` constructor
call, with entries ordered by stdlib name.

Each value of `d` is destructured as `(name, version, deps, weakdeps)`; `deps` and
`weakdeps` are printed sorted. Entries are indented by `indent` spaces, their fields by
`indent + 4`, and the closing parenthesis by `max(indent - 4, 0)`. No newline is
written after the closing parenthesis.
"""
function print_sorted(io::IO, d::Dict; indent::Int=0)
    entry_pad = " "^indent
    field_pad = " "^(indent + 4)

    println(io, "Dict{UUID,StdlibInfo}(")
    entries = sort(collect(d), by = kv -> kv[2][1])
    for (uuid, (name, version, deps, weakdeps)) in entries
        fields = (
            repr(name),
            repr(uuid),
            repr(version),
            repr(sort(deps)),
            repr(sort(weakdeps)),
        )
        println(io, entry_pad, repr(uuid), " => StdlibInfo(")
        for field in fields
            println(io, field_pad, field, ",")
        end
        println(io, entry_pad, "),")
    end
    print(io, " "^(max(indent - 4, 0)), ")")
end
260+
# Convert versions_dict to the format expected by compression functions
261+
# Convert tuples (name, version, deps, weakdeps) to StdlibInfo objects
262+
# One `julia_version => Dict(uuid => StdlibInfo)` pair per key of `versions_dict`, in
# ascending Julia version order. Each `info` tuple is (name, version, deps, weakdeps).
# `deps` and `weakdeps` are sorted here so that the generated file does not depend on
# the order in which they were collected, and so that the same dependency set always
# compares equal when consecutive Julia versions are merged into one segment.
stdlibs_by_version = [
    v => Dict{UUID, StdlibInfo}(
        uuid => StdlibInfo(info[1], uuid, info[2], sort(info[3]), sort(info[4]))
        for (uuid, info) in versions_dict[v]
    )
    for v in sort(collect(keys(versions_dict)))
]
276269

277-
output_fname = joinpath(dirname(dirname(@__DIR__)), "src", "version_map.jl")
278-
@info("Outputting to $(output_fname)")
279-
sorted_versions = sort(collect(keys(versions_dict)))
280-
open(output_fname, "w") do io
281-
print(io, """
282-
## This file autogenerated by ext/HistoricalStdlibGenerator/generate_historical_stdlibs.jl
270+
# Convert unregistered_stdlibs tuples to StdlibInfo objects
271+
# Each `info` tuple is (name, version, deps, weakdeps). `deps` and `weakdeps` are
# sorted so that the emitted `UNREGISTERED_STDLIBS` table does not depend on the order
# in which they were collected.
unregistered_stdlibs_info = Dict{UUID, StdlibInfo}(
    uuid => StdlibInfo(info[1], uuid, info[2], sort(info[3]), sort(info[4]))
    for (uuid, info) in unregistered_stdlibs
)
283275

284-
# Julia standard libraries with duplicate entries removed so as to store only the
285-
# first release in a set of releases that all contain the same set of stdlibs.
286-
const STDLIBS_BY_VERSION = [
287-
""")
288-
for v in sorted_versions
289-
print(io, " $(repr(v)) => ")
290-
print_sorted(io, versions_dict[v]; indent=8)
291-
println(io, ",")
292-
println(io)
293-
end
294-
println(io, "]")
276+
# Write compressed version map
277+
# The generated map is written to `src/` of the directory two levels above this script.
output_fname = joinpath(
    (dirname ∘ dirname)(@__DIR__), "src", "version_map_compressed.jl",
)
write_compressed_version_map(
    output_fname, stdlibs_by_version, unregistered_stdlibs_info,
)
295279

296-
println(io)
297-
print(io, """
298-
# Next, we also embed a list of stdlibs that must _always_ be treated as stdlibs,
299-
# because they cannot be resolved in the registry; they have only ever existed within
300-
# the Julia stdlib source tree, and because of that, trying to resolve them will fail.
301-
const UNREGISTERED_STDLIBS =""")
302-
print_sorted(io, unregistered_stdlibs; indent=4)
303-
end
280+
# Format the output file with Runic
281+
@info("Formatting output file with Runic...")
# Runic's file-level entry point is `format_file`; `inplace = true` is required when
# the input and output paths are the same file.
Runic.format_file(output_fname; inplace = true)

src/HistoricalStdlibVersions.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ module HistoricalStdlibVersions
77
using Pkg
88
using PrecompileTools: @setup_workload, @compile_workload
99
include("StdlibInfo.jl")
10-
include("version_map.jl")
10+
include("version_map_compressed.jl")
11+
include("uncompress.jl")
1112

1213
let
1314
max_hsg_version = maximum(first.(STDLIBS_BY_VERSION))

src/uncompress.jl

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
2+
# Build STDLIBS_BY_UUID from segments
3+
"""
    _build_stdlibs_by_uuid()

Expand `STDLIB_SEGMENTS` into a `Dict` that maps every stdlib UUID to a vector of
`julia_version_range => StdlibInfo` pairs.

For each segment, one `StdlibInfo` is created per stored version range; it combines
the segment's base info (`name`, `uuid`, `deps`, `weakdeps`) with the package version
stored for that range. Pairs are kept in the order in which the segments and ranges
are stored.
"""
function _build_stdlibs_by_uuid()
    result = Dict{UUID, Vector{Pair{Tuple{VersionNumber, VersionNumber}, StdlibInfo}}}()

    for (uuid, segments) in STDLIB_SEGMENTS
        entries = Pair{Tuple{VersionNumber, VersionNumber}, StdlibInfo}[]
        for (base_info, version_ranges) in segments, (range, ver) in version_ranges
            info = StdlibInfo(
                base_info.name,
                base_info.uuid,
                ver,
                base_info.deps,
                base_info.weakdeps,
            )
            push!(entries, range => info)
        end
        result[uuid] = entries
    end

    return result
end

const STDLIBS_BY_UUID = _build_stdlibs_by_uuid()
25+
26+
# Convert to version-indexed format for Pkg
27+
"""
    _build_stdlibs_by_version()

Rebuild the version-indexed stdlib table from `STDLIBS_BY_UUID`.

Returns a vector of `julia_version => Dict(uuid => StdlibInfo)` pairs in ascending
version order. There is one candidate entry for every Julia version that is the start
or the end of some stored range; each entry holds, per stdlib, the info of the first
stored range that contains that version. Versions for which no stdlib matches are
omitted.

NOTE(review): only range endpoints are rebuilt. A Julia version from the generator's
input that lies strictly inside every range covering it gets no entry of its own here;
confirm that this cannot drop a version at which a stdlib merely disappeared.
"""
function _build_stdlibs_by_version()
    # Every range endpoint is a Julia version that needs its own entry.
    endpoints = Set{VersionNumber}()
    for ranges in values(STDLIBS_BY_UUID), ((start_v, end_v), _) in ranges
        push!(endpoints, start_v, end_v)
    end

    result = Pair{VersionNumber, Dict{UUID, StdlibInfo}}[]
    for version in sort!(collect(endpoints))
        stdlib_dict = Dict{UUID, StdlibInfo}()
        for (uuid, ranges) in STDLIBS_BY_UUID
            # First stored range (inclusive on both ends) that contains `version`.
            hit = findfirst(r -> first(r)[1] <= version <= first(r)[2], ranges)
            hit === nothing || (stdlib_dict[uuid] = last(ranges[hit]))
        end
        isempty(stdlib_dict) || push!(result, version => stdlib_dict)
    end
    return result
end

const STDLIBS_BY_VERSION = _build_stdlibs_by_version()

0 commit comments

Comments
 (0)