provexa-app.github.io/index.html at main · provexa-app/provexa-app.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
<!DOCTYPE html>
<!-- lang declares the page language for screen readers, translation tools and search engines. -->
<html lang="en">
<head>
  <!-- Document metadata. -->
  <meta charset="utf-8">
  <meta name="description"
        content="Provexa is an interactive investigation tool that enables precise, pattern-based analysis of system audit logs to uncover complex cyberattacks at scale.">
  <meta name="keywords" content="Provexa, Cybersecurity, Attack Investigation, System Audit Logs, Human-in-the-Loop">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Provexa: Enabling Efficient Attack Investigation via Human-in-the-Loop Security Analysis</title>

  <!-- Web fonts. The "|" family separators are percent-encoded (%7C) so the URL is valid. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans%7CNoto+Sans%7CCastoro"
        rel="stylesheet">

  <!-- Stylesheets, in their original order (index.css stays last). -->
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/provexa-logo.png">

  <!-- Scripts, in their original order and loading mode. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
</head>
<body>

<!-- Top navigation bar; it currently contains only the burger toggle. -->
<nav class="navbar" aria-label="main navigation" role="navigation">
  <div class="navbar-brand">
    <a class="navbar-burger" role="button" aria-expanded="false" aria-label="menu">
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
    </a>
  </div>
</nav>


<!-- Hero: logo, paper title, authors, affiliations and resource links. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <img src="./static/images/provexa.png" alt="Provexa Logo" style="max-width:150px; margin-bottom: 20px;">
          <!-- The paper title is the page's single top-level heading. Sizing is expected to come
               from the title classes and the inline font-size rather than the tag; verify rendering. -->
          <h1 class="title is-1 publication-title" style="font-size: 2.9rem;">
            <span class="dnerf">Provexa</span>: Enabling Efficient Attack Investigation via Human-in-the-Loop Security Analysis
          </h1>
          <!-- Authors. The superscript number and colour refer to the affiliation list below. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a target="_blank" rel="noopener noreferrer" href="https://49simoney.vercel.app">Saimon Amanuel Tsegai</a><sup style="color:#ff6600;">1</sup>,
            </span>
            <!-- Authors without a homepage use an href-less placeholder anchor instead of a dead "#" link. -->
            <span class="author-block">
              <a>Xinyu Yang</a><sup style="color:#ff6600;">1</sup>,
            </span>
            <span class="author-block">
              <a>Haoyuan Liu</a><sup style="color:#003262;">2</sup>,
            </span>
            <span class="author-block">
              <a target="_blank" rel="noopener noreferrer" href="https://people.cs.vt.edu/penggao/">Peng Gao</a><sup style="color:#ff6600;">1</sup>
            </span>
          </div>

          <!-- Affiliations. -->
          <div class="is-size-5 publication-authors">
            <br>
            <span class="author-block"><sup style="color:#ff6600;">1</sup>Virginia Tech</span>
            <span class="author-block" style="padding: 0 10px;"></span>
            <span class="author-block"><sup style="color:#003262;">2</sup>University of California, Berkeley</span>
          </div>

          <!-- Resource links open in a new tab; rel="noopener noreferrer" keeps the opened page
               from getting a handle on this window. -->
          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <span class="link-block">
                <a target="_blank" rel="noopener noreferrer" href="https://www.vldb.org/pvldb/vol18/p3771-gao.pdf"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span>
              <!-- Video Link. -->
              <span class="link-block">
                <a target="_blank" rel="noopener noreferrer" href="https://youtu.be/onPvk2F8GIM"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-youtube"></i>
                  </span>
                  <span>Video</span>
                </a>
              </span>
              <!-- Code Link. -->
              <span class="link-block">
                <a target="_blank" rel="noopener noreferrer" href="https://github.com/peng-gao-lab/Provexa"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- News: announcements shown in a highlighted notification box. -->
<section class="section" style="padding-top: 1rem; padding-bottom: 1rem;">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths has-text-centered">
        <h2 class="title is-4" style="margin-bottom: 1rem;">News</h2>
        <div class="notification is-info is-light" style="border-left: 4px solid #3273dc;">
          <div class="content has-text-left">
            <!-- One list item per announcement. -->
            <ul style="margin: 0; padding-left: 1rem; font-size: 1.25rem;">
              <li><strong>🎉 June 2025:</strong> This paper has been accepted at <strong>VLDB 2025</strong>!</li>
            </ul>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Overview: architecture figure followed by a description of the system's components. -->
<section class="section">
  <div class="container" style="margin-bottom: 2vh;">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Overview</h2>
        <div class="content has-text-justified">
          <div style="display: flex; justify-content: center;">
            <img src="./static/images/provexa-arch.jpg" alt="Provexa Architecture" style="max-width:70%; height:auto;">
          </div>
          <br>
          <p>
            <span class="dnerf"><strong>Provexa</strong></span> is a human-in-the-loop investigation platform designed to uncover sophisticated, multi-stage cyberattacks by analyzing massive volumes of system audit data. At its core lies <strong>ProvQL</strong>, a powerful domain-specific language tailored for security analysts working over <em>system provenance graphs</em>.
          </p>
          <p>
            The architecture comprises lightweight <strong>system agents</strong> that collect OS-level events (file, process, and network interactions), which are then parsed and stored in <strong>graph or relational databases</strong>. These events are transformed into provenance graphs where nodes represent system entities and edges represent causal event relationships.
          </p>
          <p>
            Analysts interact with Provexa via a <strong>notebook-style UI</strong> that enables constructing queries, visualizing results, and progressively refining hypotheses. Two core primitives support investigation:
          </p>
          <ul>
            <li><strong>Attack Pattern Search</strong>: Search for suspicious multi-event behavior patterns (e.g., data exfiltration, remote access).</li>
            <li><strong>Causal Dependency Tracking</strong>: Uncover chains of events leading to or resulting from an attack indicator.</li>
          </ul>
          <p>
            The <strong>domain-aware query engine</strong> intelligently schedules subqueries for optimized performance, and <strong>in-memory result management</strong> supports fast, iterative analysis. This architecture empowers analysts to stay focused on what matters — surfacing relevant attack behaviors without wading through irrelevant noise.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Motivation: a data leakage case investigated step by step with six ProvQL queries. -->
<section class="section">
  <div class="container" style="margin-bottom: 2vh;">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Motivation</h2>
        <div class="content has-text-justified">
          <div style="display: flex; justify-content: center;">
            <img src="./static/images/demo.jpg" alt="System dependency graphs of two victim hosts in a data leakage attack, with the attack-relevant nodes and edges highlighted" style="max-width:100%; height:auto;">
          </div>
          <p>
            <strong>System dependency graphs for a multi-stage, multi-host data leakage attack.</strong> The combined dependency
            graphs of the two victim hosts contain 100,524 nodes and 154,353 edges. The attack-relevant nodes
            and edges, highlighted in dark black, comprise only 20 nodes and 20 edges, indicating the significant
            challenge of finding <b>a needle in a haystack</b>.
          </p>
          <p>
            Sophisticated cyberattacks like APTs unfold over multiple hosts and stages, often blending into normal activity. Traditional forensic tools fail to cope due to:
          </p>
          <ul>
            <li><strong>Massive event volume</strong>: Millions of daily system events overwhelm analysts.</li>
            <li><strong>Dependency explosion</strong>: Causal tracing from a single alert brings in thousands of irrelevant nodes.</li>
            <li><strong>Lack of analyst control</strong>: Existing systems don't support iterative refinement or domain-informed filtering.</li>
          </ul>
          <p>
            <strong><span class="dnerf">Provexa</span> solves this.</strong>
            We demonstrate how ProvQL is used to investigate the data leakage case in the figure above.
          </p>

          <!-- Side-by-side ordered list layout: each row holds two numbered steps, and every step
               is a <p> description followed by a highlighted ProvQL snippet. -->
          <div class="side-by-side-list">
            <!-- Row 1: Items 1 and 2 -->
            <div class="list-row">
              <div class="list-item">
                <div class="item-number">1</div>
                <div class="item-content">
                  <p>We search Host 1 for a process (<code>curl</code>) that reads a <code>*.tar</code> file and immediately sends it over the network. The result from the search confirms data exfiltration involving <code>sensitive_data.tar</code>. The result is stored in <code>poi1</code> for further analysis.</p>
                  <!-- "&&" and "<" in the query are written as character references so they are
                       never parsed as markup. -->
                  <div class="code-block">
                    poi1 = <span class="keyword">search</span> <span class="keyword">from</span> <span class="function">db</span>(<span class="string">host1</span>) where<br>
                    &nbsp;&nbsp;e1{<span class="string">name="curl"</span>, <span class="string">type=process</span>},<br>
                    &nbsp;&nbsp;e2{<span class="string">name="*.tar"</span>, <span class="string">type=file</span>},<br>
                    &nbsp;&nbsp;e3{<span class="string">type=network</span>}<br>
                    &nbsp;&nbsp;with e2[read] → e1 &amp;&amp; [&lt;1s]<br>
                    &nbsp;&nbsp;e1[write] → e3;<br>
                    <span class="keyword">display</span> poi1;
                  </div>
                </div>
              </div>
              <div class="list-item">
                <div class="item-number">2</div>
                <div class="item-content">
                  <p>We backtrack <code>poi1</code> on Host 1 to find the origin of the <code>sensitive_data.tar</code> file, filtering out benign processes like <code>vscode</code>. The results reveal that an <code>scp</code> process created the file by copying it from Host 2, confirming remote data transfer.</p>
                  <div class="code-block">
                    g1 = <span class="keyword">back track</span> <span class="function">poi1</span> <span class="keyword">from</span> <span class="function">db</span>(<span class="string">host1</span>) <span class="keyword">exclude</span> nodes where name <span class="keyword">like</span> <span class="string">"vscode"</span>;<br>
                    <span class="keyword">display</span> g1;
                  </div>
                </div>
              </div>
            </div>

            <!-- Row 2: Items 3 and 4 -->

            <div class="list-row">
              <div class="list-item">
                <div class="item-number">3</div>
                <div class="item-content">
                  <p>We backtrack the creation of <code>sensitive_data.tar</code> on Host 2. The query reveals a <code>tar</code> process packed <code>/etc/passwd</code> and <code>/etc/shadow</code> into the archive, confirming the data collection phase of the attack.</p>
                  <div class="code-block">
                    g2 = <span class="keyword">back track</span> <span class="string">"sensitive_data.tar"</span> <span class="keyword">from</span> <span class="function">db</span>(<span class="string">host2</span>)<br>
                    &nbsp;&nbsp;<span class="keyword">exclude</span> nodes where name <span class="keyword">like</span> <span class="string">"vscode"</span>;<br>
                    <span class="keyword">display</span> g2;
                  </div>
                </div>
              </div>
              <div class="list-item">
                <div class="item-number">4</div>
                <div class="item-content">
                  <p>We trace backward from the <code>curl</code> process on Host 1 to uncover the attack's entry point. Non-critical activity like <code>ping</code> is excluded. The resulting graph, stored in <code>g3</code>, helps isolate the origin of the malicious process.</p>
                  <div class="code-block">
                    g3 = <span class="keyword">back track</span> where exename <span class="keyword">like</span> <span class="string">"curl"</span> <span class="keyword">from</span> <span class="function">db</span>(<span class="string">host1</span>)
                    <span class="keyword">include</span> nodes where <span class="keyword">not</span> path <span class="keyword">like</span> <span class="string">"ping"</span>;<br>
                    <span class="keyword">display</span> g3;
                  </div>
                </div>
              </div>
            </div>

            <!-- Row 3: Items 5 and 6 -->

            <div class="list-row">
              <div class="list-item">
                <div class="item-number">5</div>
                <div class="item-content">
                  <p>We search the in-memory graph <code>g3</code> for events involving the attacker’s IP <code>20.69.152.188</code>. The query reveals a <code>lighttpd</code> process that reads from this IP, suggesting the attacker exploited a web server vulnerability to compromise Host 1. The result is saved in <code>poi2</code>.</p>
                  <div class="code-block">
                    poi2 = <span class="keyword">search</span> <span class="keyword">from</span> <span class="function">g3</span> where <br>
                    &nbsp;&nbsp;e1{<span class="string">srcip="20.69.152.188"</span>},<br>
                    &nbsp;&nbsp;e2{<span class="string">type=process</span>} <br>
                    &nbsp;&nbsp;with e1[read] → e2;<br>
                    <span class="keyword">display</span> poi2;
                  </div>
                </div>
              </div>
              <div class="list-item">
                <div class="item-number">6</div>
                <div class="item-content">
                  <p>We merge <code>g1</code> and <code>g3</code> to build a comprehensive view of the attack on Host 1 (<code>g4</code>), then perform a forward trace from the entry point <code>poi2</code> within this graph. Filtering out irrelevant nodes like <code>cat</code>, the resulting trace (<code>g5</code>) captures the critical path from entry to exfiltration.</p>
                  <div class="code-block">
                    g4 = g1 | g3;<br>
                    g5 = <span class="keyword">forward track</span> <span class="function">poi2</span> <span class="keyword">from</span> <span class="function">g4</span>
                    <span class="keyword">exclude</span> nodes where name <span class="keyword">like</span> <span class="string">"cat"</span>;<br>
                    <span class="keyword">display</span> g5;
                  </div>
                </div>
              </div>
            </div>
          </div>
          <p>
            The dark paths shown in the figure above visualize the output of <code>g5</code>, capturing the key attack steps with minimal noise. This showcases how ProvQL enables efficient, step-by-step investigation of complex, multi-stage attacks.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Key Takeaways: bulleted summary of results; the user-study item carries a nested list. -->
<section class="section">
  <div class="container" style="margin-bottom: 2vh;">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Key Takeaways</h2>
        <div class="content has-text-justified">
          <ul>
            <li><strong>🔍 Massive Reduction in Noise:</strong> Provexa filters out irrelevant events, achieving significant reduction in provenance graph size and isolating only the attack-relevant paths.</li>

            <li><strong>🧠 Human-Centric Querying:</strong> ProvQL empowers analysts to express high-level patterns and causal hypotheses with precision, while retaining control over filters, constraints, and graph traversal semantics.</li>

            <li><strong>🚀 Complementary Optimization Layer:</strong> Provexa acts as an intelligent pre-processing layer that reduces the query burden on the backend database. The system performs even better when the underlying engine is optimized, leading to lower execution time and reduced running cost.</li>

            <li><strong>🧪 User Study Results:</strong>
              <ul>
                <li>Rated highest in usability, learnability, and task success.</li>
                <li>Users completed investigations with <strong>fewer iterations</strong> and more confidence than with SQL or Cypher.</li>
                <li>Provexa was preferred by nearly all participants for real-world use.</li>
              </ul>
            </li>

            <li><strong>📊 Faster, Iterative Investigations:</strong> Provexa achieves faster query execution through in-memory result management, allowing reuse of intermediate results across queries, a feature not supported in general-purpose query languages. This leads to subsecond response times and supports fluid, iterative analyst workflows.</li>

            <li><strong>📈 Beyond Database Optimization:</strong> Our experiments show that Provexa improves execution time and cost by orders of magnitude beyond what can be achieved through backend database optimizations alone. It serves as a powerful optimization layer that amplifies performance even when the underlying database is fully tuned.</li>
          </ul>
        </div>
      </div>
    </div>
  </div>
</section>
<hr>

<!-- Call to action: button linking to the separate appendix page (additional-details.html). -->
<section class="section has-text-centered">
  <div class="container">
    <h2 class="title is-4">Want to Dive Deeper?</h2>
    <p class="content">
For more details on attack scenarios, example queries, and experimental results, check our full appendix page:
    </p>
    <a class="button is-link is-medium" href="additional-details.html">
      View Full Appendix →
    </a>
  </div>
</section>


<!-- BibTeX: citation entry for the paper. -->
<section class="section">
  <div class="container" style="margin-bottom: 2vh;">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">BibTeX</h2>
        <div class="content has-text-justified">
          <!-- The entry starts right after <code> and ends right before </code>: whitespace inside
               <pre> is rendered (and copied) verbatim, so no padding lines are left around it. -->
          <pre><code>@article{tsegai2025provexa,
    title = {Enabling Efficient Attack Investigation via Human-in-the-Loop Security Analysis},
    author = {Tsegai, Saimon Amanuel and Yang, Xinyu and Liu, Haoyuan and Gao, Peng},
    year = {2025},
    issue_date = {July 2025},
    journal = {Proceedings of the VLDB Endowment},
    volume = {18},
    number = {11},
    doi = {10.14778/3749646.3749653},
    pages = {3771-3783},
}</code></pre>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Institution logo strip. The link opens in a new tab, so rel carries "noopener noreferrer"
     to keep the opened page from getting a handle on this window. -->
<section>
  <div id="org-banners" style="display: flex; justify-content: center; align-items: center; gap: 40px; padding-bottom: 30px;">
    <a href="https://www.vt.edu/" target="_blank" rel="external noopener noreferrer">
      <img class="center-block org-banner" src="static/images/vt-logo.jpg" alt="Virginia Tech Logo" style="height: 60px; width: auto; object-fit: contain; display: block;">
    </a>
  </div>
</section>

<!-- Footer: template attribution and licence notice. -->
<footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <p>
            This website is adapted from <a href="https://nerfies.github.io">Nerfies</a>, licensed under a
            <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>

</body>
</html>