-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_distances.py
More file actions
235 lines (192 loc) · 7.81 KB
/
Copy pathrun_distances.py
File metadata and controls
235 lines (192 loc) · 7.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import os
import glob
import torch
import numpy as np
from tqdm import tqdm
from train_utils import get_dataset
from simple_parsing import ArgumentParser
from pathlib import Path
from stats_utils import (
sort_files,
create_dirs,
get_train_loader,
get_metric_fn,
)
from torch.utils.data import DataLoader
"""Script for computing and storing memorized and spurious ratios along with the indices for nearest neighbors.
Example usage:
python run_distances.py \
--result-path distances \
--synth-path cifar10-synths/ \
--data-path data/ \
--use-lpips
--k 5
--network vgg
You could also use a config file instead and provide additional modifications to your arguments if needed.
python run_distances.py --config_path=my_config.py --data-path data/
"""
def dist_fn(sample, ref_set, metric_fn, batch_size, k, device="cuda"):
"""
Compute Pair-wise Distances of elements in a given target set to a reference set
Args:
sample: Tensor of size B x ...
ref_set: Tensor of size M x ...
metric_fn: a function which takes x, y for computing their distance to each other
batch_size: the batch size for the reference set
k: the number of nearest neighbors
Returns:
top_dist: B x k
top_indices: B x k
"""
same_set = True
if type(ref_set) is not DataLoader: # If given a synthetic set
dists = torch.full([len(sample), len(ref_set)], float("inf"), device=device)
for i in range(0, len(ref_set), batch_size):
j = min(i + batch_size, len(ref_set))
ref = torch.from_numpy(ref_set[i:j]).to(device)
dists[:, i:j] = metric_fn(sample, ref)
else: # If given a training set
i = 0
same_set = False
dists = torch.full(
[len(sample), len(ref_set.dataset)], float("inf"), device=device
)
for ref, _ in ref_set:
j = min(i + len(ref), len(ref_set.dataset))
dists[:, i:j] = metric_fn(sample, ref.to(device))
i = j
k = min(
dists.shape[1], k
) # for the case in which the number of samples is less than k
if same_set: # when we are computing synthetic to synthetic
sk = min(k + 1, dists.shape[1])
top_dists, top_idx = dists.topk(sk, dim=1, largest=False)
return top_dists[:, 1:], top_idx[:, 1:] # remove first column
return dists.topk(k, dim=1, largest=False)
def get_top_dists(eval_set, ref_set, metric_fn, batch_sizes, k=1_000, device="cuda"):
"""
Compute Pair-wise Distances of elements in a given target set to a reference set
Args:
eval_set: Tensor of size B x ...
ref_set: Tensor of size M x ...
metric_fn: a function which takes x, y for computing their distance to each other
k: the number of nearest neighbors
Returns:
top_dist: B x k
top_indices: B x k
"""
top_dists, top_indices = [], []
eval_batch_size, ref_batch_size = batch_sizes
ref_batch_size = eval_batch_size if ref_batch_size is None else ref_batch_size
for i in range(0, len(eval_set), eval_batch_size):
j = min(i + eval_batch_size, len(eval_set))
target = torch.from_numpy(eval_set[i:j]).to(device)
top_dist, top_index = dist_fn(
target, ref_set, metric_fn, ref_batch_size, k, device
)
top_dists.append(top_dist)
top_indices.append(top_index)
top_dists, top_indices = map(torch.cat, (top_dists, top_indices))
return top_dists.cpu(), top_indices.cpu()
def main(args):
result_path = args.result_path
create_dirs([result_path])
# Grab .npz files for evaluation
synth_files = glob.glob(os.path.join(args.synth_path, "*.npz"))
synth_files = sort_files(
synth_files
) # sort by name (or prefix converted into integer)
# Slice the number of files for evaluation
final_idx = len(synth_files) if args.final_idx == -1 else args.final_idx
synth_files = synth_files[args.start_idx : final_idx]
print(*synth_files, sep="\n")
dataset = None
metric_fn = get_metric_fn(args.use_lpips, network=args.network)
lpips = "lpips" if args.use_lpips else "l2"
backbone = args.network if args.use_lpips else "none"
print(f"Using {lpips} with backbone: {backbone}")
ref_batch_size = args.ref_batch_size
eval_batch_size = args.eval_batch_size
ref_batch_size = eval_batch_size if ref_batch_size is None else eval_batch_size
# Go through each synthetic file and compute the distances
for synth_path in tqdm(synth_files):
synth_ckpt = np.load(synth_path, allow_pickle=True)
ckpt_args = synth_ckpt["args"].item()
# load the entire dataset a single time
if dataset is None:
dataset = get_dataset(
args.data_path,
ckpt_args.data.data_name,
(
None
if not (ckpt_args.train.centercrop)
else ckpt_args.model.image_size
),
)
# split the dataset into a subset of K training samples
train_loader = get_train_loader(dataset, ref_batch_size, ckpt_args)
data_size = len(train_loader.dataset)
save_path = os.path.join(result_path, f"{data_size}")
if not (os.path.exists(save_path + ".npz")) or args.overwrite:
synth_set = synth_ckpt["samples"]
batch_sizes = (eval_batch_size, ref_batch_size)
# compute synthetic to training set distances
data_dists, data_indices = get_top_dists(
synth_set, train_loader, metric_fn, batch_sizes, args.k
)
# compute synthetic to synthetic distances
synth_dists, synth_indices = get_top_dists(
synth_set, synth_set, metric_fn, batch_sizes, args.k
)
# save the results
results = {
"data-dists": data_dists.numpy(), # synthetic to training set distances
"synth-dists": synth_dists.numpy(), # synthetic to synthetic distances
"data-indices": data_indices.numpy(),
"synth-indices": synth_indices.numpy(),
}
np.savez_compressed(save_path, **results)
if __name__ == "__main__":
parser = ArgumentParser(add_config_path_arg=True)
parser.add_argument(
"--result-path",
type=str,
help="Path to stored the results. If specified None, then results are stored in the same path as ckpt path.",
)
parser.add_argument(
"--synth-path", type=str, help="Path to evaluation files (saved as .npz)."
)
parser.add_argument("--data-path", type=str, help="Path to the dataset folder.")
parser.add_argument(
"--eval-batch-size", type=int, default=256, help="Batch size for evaluation."
)
parser.add_argument(
"--ref-batch-size",
type=int,
default=None,
help="Batch size for reference set. If None, it is defaulted to eval_batch_size.",
)
parser.add_argument(
"--use-lpips", action="store_true", help="Use LPIPS for distance."
)
parser.add_argument(
"--network", type=str, default="alex", help="Backbone for LPIPS."
)
parser.add_argument("--k", type=int, default=5, help="Number of nearest neighbors.")
parser.add_argument(
"--start-idx",
type=int,
default=0,
help="Starting Index use for slicing the set of files we have to compute over.",
)
parser.add_argument(
"--final-idx",
type=int,
default=-1,
help="Ending Index use for slicing the set of files we have to compute over.",
)
parser.add_argument(
"--overwrite", action="store_true", help="Overwrite existing results."
)
args = parser.parse_args()
main(args)