Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
fda4ffa
added testing file
rash92 Jan 30, 2024
523b57b
added physics dataset to texts, added logging for ff_scores and ff_cr…
rash92 Jan 30, 2024
c45c8ec
saving and loading tensors
rash92 Feb 3, 2024
178fd65
pruning and comparing working
rash92 Feb 3, 2024
43c4c3a
removed saved tensor files, runs to find common pruning working on ti…
rash92 Feb 4, 2024
24b3006
added datasets
rash92 Feb 9, 2024
450aa48
tried some changes to pruningConfig that will hopefully fix issues wi…
rash92 Feb 10, 2024
598f998
runs working but memory problems when doing too much at once, 0.01 sa…
rash92 Feb 14, 2024
328ec62
setup for extra repos done, need to run for ff_frac = 0.01 then use g…
rash92 Feb 14, 2024
d0fbc19
added a bunch of dataset tensors
rash92 Feb 15, 2024
313e928
added datasets
rash92 Feb 15, 2024
d053684
added datasets
rash92 Feb 15, 2024
78c5cfa
dataset tensors done
rash92 Feb 15, 2024
1f11ddc
able to generate pruning ratios quickly using existing files
rash92 Feb 16, 2024
cfc5805
moved functions around, cleaned up old unused code
rash92 Feb 16, 2024
70b99bd
moved functions around, cleaned up old unused code
rash92 Feb 16, 2024
d2a5b05
typo after deleting legacy code fixed
rash92 Feb 16, 2024
911031b
moved plot function to own file
rash92 Feb 16, 2024
d456c6b
created file with saved pruning ratios
rash92 Feb 16, 2024
67e0b85
plots added
rash92 Feb 16, 2024
fb15b66
reordered repos in plot slightly
rash92 Feb 16, 2024
aa53b71
Tensors from random text model
TetraspaceW Mar 3, 2024
9aaf2b0
reevaluation functions done but not tested
rash92 Mar 10, 2024
2f038d9
Merge branch 'rashid' of https://github.com/rash92/taker into rashid
rash92 Mar 10, 2024
9d81002
functions mostly working, testing and optimising for speed
rash92 Mar 10, 2024
2c6ad6c
relative accuracy
rash92 Mar 10, 2024
03ecb34
fixed error with find accuracy function
rash92 Mar 10, 2024
d3eb7e9
added safeguards to find accuracy, added timer, added extra logging
rash92 Mar 10, 2024
61202d0
added more timers to check if worth optimising
rash92 Mar 10, 2024
0abd88f
seems to be working, tested on sample of 1k, now to try 10k
rash92 Mar 10, 2024
faca774
fixed typos in dataset names
rash92 Mar 10, 2024
9cb4345
changed timers
rash92 Mar 10, 2024
f0b5b91
added saving answer to file
rash92 Mar 10, 2024
efe6849
added pile related datasets with sample of 10k
rash92 Mar 11, 2024
698f470
renamed file, formatting
rash92 Mar 11, 2024
6571c36
removed print, changed ff_frac precision, using all datasets with fil…
rash92 Mar 12, 2024
25a577c
saved results
rash92 Mar 12, 2024
5d29476
merged src changes from upstream
rash92 Mar 14, 2024
1d5fc1b
testing activation observations
rash92 Mar 25, 2024
87ee50b
Merge branch 'rashid' of https://github.com/rash92/taker into rashid
rash92 Mar 25, 2024
f645a13
started
rash92 Mar 25, 2024
6c6548c
file saving/loading funcs
rash92 Mar 26, 2024
a316b09
stuck
rash92 Mar 27, 2024
172cb5a
merged with updates to original taker
rash92 Apr 3, 2024
ab2f5b7
added id recording
rash92 Apr 3, 2024
5eb26f1
changed image transformer related issues from merge, deleted evalconf…
rash92 Apr 3, 2024
6d74f44
getting data in nice dict
rash92 Apr 3, 2024
239335d
saved physics test tensors
rash92 Apr 3, 2024
c40bc8d
added phate visualisation notebook
rash92 Apr 10, 2024
ee4d75b
changed activation to do 1 file per dataset
rash92 Apr 10, 2024
f9b0a47
changed activation to do 1 file per dataset
rash92 Apr 10, 2024
b58abf2
fixed return value
rash92 Apr 10, 2024
712af22
fixed variable name
rash92 Apr 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 133 additions & 0 deletions examples/neuron-mapping/activation_observations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import torch
import os
import numpy as np
import random
from datetime import datetime
from taker.eval import run_evaluation
from taker.data_classes import PruningConfig, EvalConfig, RunDataHistory
from taker.activations import get_top_frac, get_midlayer_activations
from taker.model import Model
from taker.texts import infer_dataset_config
from taker.eval import evaluate_all

# Banner so long-running collection jobs are identifiable in logs.
print("starting activation observations")

# Pruning/collection configuration consumed by get_activations() below.
# NOTE(review): only model_size/token_limit/dtype/collection_sample_size and
# the device/mask fields are read in this script; the pruning fields are
# presumably used when the same config drives run_evaluation — confirm.
c = PruningConfig(
    wandb_project = "testing", # repo to push results to
    # model_repo = "nickypro/tinyllama-15M",
    # model_repo = "facebook/opt-1.3b",
    model_repo = "nickypro/llama-7b-hf-rand",
    token_limit = 1000, # trim the input to this max length
    run_pre_test = False, # evaluate the unpruned model
    eval_sample_size = 1e3,
    collection_sample_size = 1e3, # number of samples collected per dataset
    # Removals parameters
    ff_frac = 0.01, # fraction of feed-forward neurons to prune
    attn_frac = 0.00, # fraction of attention neurons to prune
    focus = "pile", # the "reference" dataset
    cripple = "physics", # the "unlearned" dataset
    additional_datasets=tuple(), # any extra datasets to evaluate on
    recalculate_activations = False, # iterative vs non-iterative
    dtype = "int4", # quantized weights to fit the 7B model in memory
    n_steps = 1,
)

def set_seed(seed):
    """Seed every RNG this script relies on (Python, NumPy, PyTorch CPU and
    CUDA) and force cuDNN into deterministic mode so runs are reproducible."""
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    # Deterministic cuDNN kernels; disable auto-tuning, which is stochastic.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def save_data_dict( model_size: str,
                    data: object,
                    name: str ) -> str:
    """Persist *data* twice under saved_tensors/<model_size>/: a "-recent"
    copy that later runs overwrite, and a timestamped copy that is kept.

    Args:
        model_size: subdirectory / filename tag for the model variant.
        data: anything torch.save accepts (here, a dict of tensors).
            (Was annotated `any`, which is the builtin function, not a type.)
        name: logical name of the saved bundle.

    Returns:
        The timestamped filepath (matches the original's return value).
    """
    now = datetime.now().strftime( "%Y-%m-%d_%H:%M:%S" )
    os.makedirs( f'saved_tensors/{model_size}', exist_ok=True )
    # "recent" first, then the immutable timestamped copy — the loop replaces
    # two copy-pasted save/print pairs; the last filepath is the one returned.
    for tag in ("recent", now):
        filepath = f'saved_tensors/{model_size}/{name}-{model_size}-{tag}.pt'
        torch.save( data, filepath )
        print( f'Saved {filepath} to {model_size}' )
    return filepath

def load_pt_file(filepath: str):
    """Load a .pt file and print its top-level keys as a quick sanity check.

    Returns the loaded object (expected to be a dict)."""
    loaded = torch.load(filepath)
    for key in loaded:
        print(key)
    return loaded


def get_activations(c: PruningConfig, datasets: list[str]):
    """Collect mid-layer activation data for each dataset and save one file
    per dataset via save_data_dict.

    Args:
        c: pruning config supplying the model repo and collection settings.
        datasets: dataset names understood by get_midlayer_activations.

    Returns:
        List of filepaths (timestamped copies) that were written.
    """
    # Initialise the model once; it is reused for every dataset.
    opt = Model(
        c.model_size,
        limit=c.token_limit,
        dtype=c.dtype,
        svd_attn=c.svd_attn,
        use_accelerator=c.use_accelerator,
        model_device=c.model_device,
        mask_fn=c.mask_fn,
    )
    saved_files = []
    for dataset in datasets:
        acts = get_midlayer_activations(
            opt=opt,
            dataset_name=dataset,
            sample_size=c.collection_sample_size,
            attn_mode=c.attn_mode,
            collect_attn=True,
            collect_ids=True,
            random_subset_frac=0.01
        )
        # Keep only the raw pieces needed downstream; one file per dataset.
        results = {
            "dataset": dataset,
            "criteria": acts.raw["criteria"],
            "attn": acts.raw["attn"],
            "input_ids": acts.raw["input_ids"],
            "expected_ids": acts.raw["expected_ids"],
        }
        filepath = save_data_dict("llama-7b", results, f"{dataset}_activations")
        print(f"file for {dataset} activations saved to: ", filepath)
        saved_files.append(filepath)
    return saved_files

# Every dataset we collect activations for: subject datasets plus the
# individual Pile subsets (names must match taker's dataset configs).
all_datasets = ["biology",
                "chemistry",
                "civil",
                "code",
                "emotion",
                "math",
                "physics",
                "pile_ArXiv",
                "pile_Enron_Emails",
                "pile_EuroParl",
                "pile_FreeLaw",
                "pile_Github",
                "pile_Gutenberg",
                "pile_HackerNews",
                "pile_NIH_ExPorter",
                "pile_PhilPapers",
                "pile_PubMed_Abstracts",
                "pile_PubMed_Central",
                "pile_StackExchange",
                "pile_Ubuntu_IRC",
                "pile_USPTO_Backgrounds",
                "pile_Wikipedia",
                "poems"]

# Smaller subset useful for quick smoke tests (currently unused).
test_datasets = ["physics", "math", "code", "pile_Github", "pile_HackerNews"]

# Run the full collection; `data` is the list of saved filepaths.
data = get_activations(c, all_datasets)

# filepath = save_data_dict("llama-7b", data, "test_activations")
# print("file saved to: ", filepath)

# filepath = "saved_tensors/llama-7b/test_activations-llama-7b-recent.pt"
# loaded_data = load_pt_file(filepath)["physics"]

# print(loaded_data["attn"])
84 changes: 59 additions & 25 deletions examples/neuron-mapping/compare.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,63 @@
import torch
import datetime
import os
from taker.activations import get_top_frac

# The old `compare_pruned_ff_criteria(cripple_repos, model_size, path, focus_repo)`
# header that preceded this function was leftover deleted-diff residue from a
# merge view (it redefined the function below with an incomplete body and
# parameters the new code no longer uses); it has been removed.

# Takes repo name, model size and timestamp as strings and returns the
# ff_scores and ff_criteria tensors by loading the relevant saved file.
def load_tensors_for_repo(repo, model_size="hf", timestamp="recent"):
    directory = "/home/ubuntu/taker-rashid/examples/neuron-mapping/saved_tensors/"+model_size+"/"
    filename = repo+"-pile-"+model_size+"-"+timestamp+".pt"
    # NOTE(review): hard-coded absolute path; only works on the original host.
    data = torch.load(directory+filename)
    return data["ff_scores"], data["ff_criteria"]


def get_ff_criteria_for_ff_frac(repo, ff_frac):
    """Recompute the pruning-criteria mask for *repo* at the given ff_frac,
    from the saved "recent" ff_scores tensor."""
    scores, _ = load_tensors_for_repo(repo)
    mask, _ = get_top_frac(scores, ff_frac)
    return mask

def compare_pruned_ff_criteria(repos, ff_fracs, model_size="hf"):
    """For each ff_frac and each ordered pair of repos, compute the fraction
    of repo1's pruned neurons that repo2's pruning would also select.

    Args:
        repos: repo names whose saved ff_scores tensors exist on disk.
        ff_fracs: pruning fractions to evaluate.
        model_size: tag recorded in the result (files are loaded via
            load_tensors_for_repo's default, also "hf").

    Returns:
        Nested dict: ratios[ff_frac][repo1][repo2] -> 0-dim tensor in [0, 1],
        plus ratios["model_size"] for provenance.

    This body also drops leftover deleted-diff lines from the old version
    (references to undefined `cripple_repos`/`directory`/`suffix` and a stale
    module-level call using the removed signature), which would have raised
    NameError/TypeError at import time.
    """
    ratios = {}
    ratios["model_size"] = model_size

    for ff_frac in ff_fracs:
        # Load/compute each repo's mask once per ff_frac, instead of
        # re-reading the file for repo2 on every inner-loop iteration.
        criteria = {repo: get_ff_criteria_for_ff_frac(repo, ff_frac)
                    for repo in repos}
        ratios[ff_frac] = {}
        for repo1 in repos:
            ratios[ff_frac][repo1] = {}
            for repo2 in repos:
                if repo1 == repo2:
                    continue
                # Overlap relative to repo1's mask size.
                matches = torch.logical_and(criteria[repo1], criteria[repo2])
                ratio = torch.sum(matches)/torch.sum(criteria[repo1])
                ratios[ff_frac][repo1][repo2] = ratio

    return ratios

def load_pt_file(directory: str, filename: str):
    """Load directory+filename with torch, echoing top-level keys for a
    quick sanity check; returns the loaded object."""
    contents = torch.load(directory + filename)
    for k in contents:
        print(k)
    return contents

def save_pruning_data_dict( model_size: str,
                            data: object,
                            name: str ) -> str:
    """Persist *data* twice under saved_tensors/<model_size>/: a "-recent"
    copy that later runs overwrite, and a timestamped copy that is kept.

    Args:
        model_size: subdirectory / filename tag for the model variant.
        data: anything torch.save accepts (was annotated `any`, the builtin
            function, not a type).
        name: logical name of the saved bundle.

    Returns:
        The timestamped filename (matches the original's return value).
    """
    now = datetime.datetime.now().strftime( "%Y-%m-%d_%H:%M:%S" )
    os.makedirs( f'saved_tensors/{model_size}', exist_ok=True )
    # Loop replaces two copy-pasted save/print pairs; the log line previously
    # printed a literal "(unknown)" instead of the path that was written.
    for tag in ("recent", now):
        filename = f'saved_tensors/{model_size}/{name}-{model_size}-{tag}.pt'
        torch.save( data, filename )
        print( f'Saved {filename} to {model_size}' )
    return filename



ff_fracs = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
# NOTE: the original list repeated pile_PhilPapers, pile_EuroParl and
# pile_Gutenberg; the duplicates only redid identical work (self-pairs are
# skipped by name), so each repo is now listed exactly once.
cripple_repos = ["biology", "chemistry", "physics", "math", "code", "poems",
                 "civil", "emotion", "pile_FreeLaw", "pile_PubMed_Abstracts",
                 "pile_PubMed_Central", "pile_NIH_ExPorter", "pile_Enron_Emails",
                 "pile_Github", "pile_StackExchange", "pile_HackerNews",
                 "pile_ArXiv", "pile_Wikipedia", "pile_Ubuntu_IRC",
                 "pile_USPTO_Backgrounds", "pile_PhilPapers", "pile_EuroParl",
                 "pile_Gutenberg"]

# Compute pairwise pruning-mask overlap ratios and persist them.
ratios = compare_pruned_ff_criteria(cripple_repos, ff_fracs)
filename = save_pruning_data_dict("hf", ratios, "pruning_ratios")
print("saved pruning ratios to: ", filename)
Loading