grow_coboundary_graph.py
import os
import pickle
import sympy as sp
import numpy as np
import networkx as nx
from multiprocessing import Pool, cpu_count
from unifier import apply_match_pcfs, PCF
# This script corresponds to the coboundary graph growing algorithm (Appendix B.1)
# `apply_match_pcfs` corresponds to the matching algorithm (Appendix B.2)
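# Overview of the pipeline implemented below:
#   Phase 1: cluster the dataset PCFs by delta and, within each cluster, attempt a
#            coboundary match between every pair of still-unmatched PCFs ("hubs").
#            Every attempt is recorded as an edge in a directed graph; successful
#            attempts store the transformation and turn the matched PCF into a non-hub.
#   Phase 2: for every PCF that remained a hub, attempt a coboundary match against the
#            CMF PCFs of similar delta; on success the CMF PCF is added as a node
#            (carrying a 'cmf_sources' attribute) and connected to the hub.
# The graph is pickled to SAVE_DIR periodically, so an interrupted run can be resumed
# via STARTING_GRAPH and CMF_ATTEMPTED.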
SAVE_DIR = 'coboundary_graphs'
PCFS = "dataset/pcfs.pkl"
CMF_PCFS = "dataset/cmf_pcfs_compact.pkl" # compact version of cmf_pcfs.pkl to reduce runtime
STARTING_GRAPH = '' # fill in if a previous run was interrupted
CMF_ATTEMPTED = '' # fill in if a previous run was interrupted
START_DELTA = -1.00 # delta to start from
END_DELTA = 0.05 # delta to end at
DELTA_GRANULARITY = 0.05 # cluster shift
DELTA_SIMILARITY_THRESHOLD = 0.03 # threshold for delta similarity
NUM_WORKERS = max(4, cpu_count() - 6) # CONFIGURE PER YOUR MACHINE: number of workers for multiprocessing
VERBOSE = True
def round_convrate(x):
    if abs(x) <= 0.05:
        x = 0
    return x
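# Worker function executed in the multiprocessing pool: try to find a coboundary
# transformation between two PCFs via `apply_match_pcfs`. Returns the two (a, b) pairs,
# the transformation (or None if matching failed or raised), and the attempt id.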
def explore_edge(args):
    edge_attempt_id, a1, b1, lim1, convrate1, a2, b2, lim2, convrate2 = args
    if VERBOSE:
        print(f' Starting edge attempt #{edge_attempt_id}... {a1}, {b1} -> {a2}, {b2}')
    pcf = PCF(sp.sympify(a1), sp.sympify(b1))
    limit = sp.sympify(lim1)
    convrate = convrate1
    pcf2 = PCF(sp.sympify(a2), sp.sympify(b2))
    limit2 = sp.sympify(lim2)
    try:
        transformation = apply_match_pcfs(pcf, pcf2, limit, limit2, convrate, convrate2)
        return (a1, b1), (a2, b2), transformation, edge_attempt_id
    except Exception:
        return (a1, b1), (a2, b2), None, edge_attempt_id
if __name__ == "__main__":
    with open(PCFS, 'rb') as f:
        pcfs = pickle.load(f)
    pcfs = pcfs[pcfs.apply(lambda x: 'pi' in str(x['limit']), axis=1)]
    pcfs = pcfs.sort_values(by=['a', 'b'], key=lambda x: x.str.len())
    with open(CMF_PCFS, 'rb') as f:
        cmf_pcfs = pickle.load(f)
    cmf_pcfs = cmf_pcfs[cmf_pcfs.apply(lambda x: 'pi' in str(x['limit']), axis=1)]
    cmf_pcfs = cmf_pcfs.sort_values(by=['a', 'b'], key=lambda x: x.str.len())

    if CMF_ATTEMPTED:
        with open(CMF_ATTEMPTED, 'rb') as f:
            cmf_attempted = pickle.load(f)

    print('Initializing graph...')
    if STARTING_GRAPH:
        if os.path.exists(STARTING_GRAPH):
            print(' Loading existing graph...')
            with open(STARTING_GRAPH, 'rb') as f:
                graph = pickle.load(f)
        else:
            raise ValueError('Graph file not found.')
        attempted = [set(edge) for edge in graph.edges()]
        nonhubs = [v for u, v, data in graph.edges(data=True) if data['transformation'] is not None]
        edge_attempt_number = len(graph.edges())
        edge_number = len(nonhubs)
    else:
        graph = nx.DiGraph()
        for i, row in pcfs.iterrows():
            graph.add_node(row['ab'], a=row['a'], b=row['b'], limit=row['limit'], sources=row['sources'],
                           delta=row['delta'], convergence_rate=row['convergence_rate'])
        attempted = []
        nonhubs = []
        edge_attempt_number = 0
        edge_number = 0
    last_saved_at_edge_attempt_number = edge_attempt_number
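
    # Phase 1: sweep delta in steps of DELTA_GRANULARITY; within each delta cluster,
    # attempt coboundary matches between all pairs of hub PCFs in parallel.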
    for delta in np.arange(START_DELTA, END_DELTA, DELTA_GRANULARITY):
        delta = round(delta, 3)
        print(f'Starting delta={delta}...')
        candidates = pcfs[pcfs['delta'].apply(lambda x: abs(x - delta) < DELTA_SIMILARITY_THRESHOLD)]
        print(' candidates:', len(candidates))
        for i, (ind, row) in enumerate(candidates.iterrows()):  # hub
            ab = row['ab']
            if ab in nonhubs:
                continue
            args_list = []
            for j, row2 in candidates.iloc[i+1:].iterrows():
                if row2['ab'] in nonhubs or {row['ab'], row2['ab']} in attempted:
                    continue
                edge_attempt_number += 1
                args_list.append(
                    (edge_attempt_number,
                     row['a'], row['b'], row['limit'], round_convrate(float(row['convergence_rate'])),
                     row2['a'], row2['b'], row2['limit'], round_convrate(float(row2['convergence_rate'])))
                )
            with Pool(processes=NUM_WORKERS) as pool:
                results = pool.map(explore_edge, args_list)
            for ab, ab2, transformation, edge_attempt_id in sorted(results, key=lambda x: x[3]):
                if transformation is not None:
                    edge_number += 1
                    graph.add_edge(ab, ab2, transformation=transformation, edge_number=edge_number, edge_attempt=edge_attempt_id)
                    nonhubs.append(ab2)
                else:
                    graph.add_edge(ab, ab2, transformation=None, edge_attempt=edge_attempt_id)
                attempted.append({ab, ab2})
            if (edge_attempt_number - last_saved_at_edge_attempt_number >= 20 or i == len(candidates) - 1) \
                    and edge_attempt_number - last_saved_at_edge_attempt_number:  # make sure there are new edges
                print(' edge attempts:', edge_attempt_number, ' matches (nonhubs):', len(nonhubs))
                print(' Saving current graph...')
                os.makedirs(SAVE_DIR, exist_ok=True)
                with open(os.path.join(SAVE_DIR, f'graph_{START_DELTA}_to_{delta}.pkl'), 'wb') as f:
                    pickle.dump(graph, f)
                last_saved_at_edge_attempt_number = edge_attempt_number

    print('edge attempts:', edge_attempt_number, ' matches (nonhubs):', len(nonhubs))
    print('Saving current graph...')
    os.makedirs(SAVE_DIR, exist_ok=True)
    with open(os.path.join(SAVE_DIR, f'graph_{START_DELTA}_to_{delta}_before_cmf_pcfs.pkl'), 'wb') as f:
        pickle.dump(graph, f)
    final_delta = delta
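
    # Phase 2: for each PCF that is still a hub, attempt coboundary matches against
    # CMF PCFs of similar delta, in batches of NUM_WORKERS, stopping at the first match.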
    print('Starting matching to CMF PCFs...')
    print('Total hub PCFs:', len(pcfs) - len(nonhubs))
    cmf_pcf_abs = list(cmf_pcfs['ab'].values)
    if not CMF_ATTEMPTED:
        cmf_attempted = []  # CMF <-> non-CMF "edges" that have already been attempted
    last_saved_at_cmf_attempted = len(cmf_attempted)
    nontrivial_hubs_attempted = 0  # nontrivial hubs for which we have attempted to find a match
    trivial_hubs = 0
    for i, (ind, row) in enumerate(pcfs.iterrows()):  # hub
        ab = row['ab']
        delta = round(row['delta'], 3)
        # Even for nonhubs, if the PCF is in cmf_pcfs, mark this node as a cmf_pcf node
        # (add a 'cmf_sources' attribute). The total unification count is found later by
        # checking how many nodes with the 'sources' attribute (reserved for non-CMF PCFs)
        # appear in connected components that contain a cmf_pcf node.
        trivial = False
        if ab in cmf_pcf_abs:
            trivial = True
            graph.nodes[ab]['cmf_sources'] = cmf_pcfs[cmf_pcfs['ab'] == ab].iloc[0]['sources']
        if ab in nonhubs:
            # trivial or not, this PCF is not a hub
            continue
        elif trivial:
            trivial_hubs += 1
            continue
        nontrivial_hubs_attempted += 1
        print(f" starting hub {nontrivial_hubs_attempted + trivial_hubs} ({round(delta, 2)}): {ab}")
        candidates = cmf_pcfs[cmf_pcfs['delta'].apply(lambda x: abs(x - delta) < 0.03)]
        print(' candidates:', len(candidates))
        # Split the candidates into batches of NUM_WORKERS and multiprocess each batch.
        # As soon as a batch yields a successful match, set match_found=True and stop.
        # On a match, create a node for the CMF PCF if it doesn't exist yet and give it
        # a 'cmf_sources' attribute (NOT 'sources').
        args_list = []
        graph_edges = list(graph.edges())
        for j, row2 in candidates.iterrows():
            if {ab, row2['ab']} in attempted or (row2['ab'], ab) in cmf_attempted \
                    or (row2['ab'], ab) in graph_edges:
                # removed this condition: row2['ab'] in nonhubs
                continue
            edge_attempt_number += 1
            args_list.append(
                (edge_attempt_number,
                 row2['a'], row2['b'], row2['limit'], round_convrate(float(row2['convergence_rate'])),
                 row['a'], row['b'], row['limit'], round_convrate(float(row['convergence_rate'])))
            )
        args_list = sorted(args_list, key=lambda x: (len(x[1]), len(x[2])))  # sort by length of cmf pcfs' a, b
        match_found = False
        for b in range(0, len(args_list), NUM_WORKERS):
            if match_found:
                break
            args_list_batch = args_list[b:b + NUM_WORKERS]
            with Pool(processes=NUM_WORKERS) as pool:
                results = pool.map(explore_edge, args_list_batch)
            for ab2, ab, transformation, _ in sorted(results, key=lambda x: (len(x[0][0]), len(x[0][1]))):
                # key for sorting: length of cmf pcfs' a, b
                if match_found:
                    break
                cmf_attempted.append((ab2, ab))
                if transformation is not None:
                    match_found = True
                    edge_number += 1
                    row2 = candidates[candidates['ab'] == ab2].iloc[0]
                    graph.add_node(ab2, a=row2['a'], b=row2['b'], limit=row2['limit'], cmf_sources=row2['sources'],
                                   delta=row2['delta'], convergence_rate=row2['convergence_rate'])
                    graph.add_edge(ab2, ab, transformation=transformation, edge_number=edge_number)
            if len(cmf_attempted) - last_saved_at_cmf_attempted:
                with open(os.path.join(SAVE_DIR, 'cmf_attempted.pkl'), 'wb') as f:
                    pickle.dump(cmf_attempted, f)
                last_saved_at_cmf_attempted = len(cmf_attempted)
        if match_found:
            print(f' found match for {ab}!')
            print(' saving current graph')
            with open(os.path.join(SAVE_DIR, f'graph_{START_DELTA}_to_{final_delta}_matched_to_cmf_pcfs.pkl'), 'wb') as f:
                pickle.dump(graph, f)
        else:
            print(f' no match found for {ab}')
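
    # Illustrative post-processing sketch: one possible way to compute the "total
    # unification count" mentioned above, by counting dataset PCF nodes (those carrying
    # a 'sources' attribute) whose weakly connected component also contains a CMF PCF
    # node (one carrying 'cmf_sources').
    unified_count = 0
    for component in nx.weakly_connected_components(graph):
        if any('cmf_sources' in graph.nodes[n] for n in component):
            unified_count += sum(1 for n in component if 'sources' in graph.nodes[n])
    print('Dataset PCFs unified with a CMF PCF:', unified_count)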