Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions dashboard/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Adds User stats report to prebuilt NGB image
FROM ngb:latest
# FROM ubuntu:16.04

# Miniconda release used to provision the dashboard's Python environment.
ARG ANACONDA_VERSION="3-latest"
ENV ANACONDA_HOME=/opt/local/anaconda
ENV PATH="$PATH:$ANACONDA_HOME/bin"

ENV DASHBOARD_HOME=/opt/dashboard
ADD dashboard/ "$DASHBOARD_HOME/dashboard/"
ADD parser/ "$DASHBOARD_HOME/parser/"
ADD requirements.txt "$DASHBOARD_HOME/"
ADD start.sh "$DASHBOARD_HOME/"

# Install Miniconda, create the dedicated "dashboard" env and install the
# dashboard's Python requirements into it.
RUN mkdir -p "$ANACONDA_HOME" && \
    wget -q "https://repo.anaconda.com/miniconda/Miniconda${ANACONDA_VERSION}-Linux-x86_64.sh" -O /tmp/Anaconda_Install.sh --no-check-certificate && \
    bash /tmp/Anaconda_Install.sh -f -b -p "$ANACONDA_HOME" && \
    rm -f /tmp/Anaconda_Install.sh && \
    conda init bash && \
    . $ANACONDA_HOME/etc/profile.d/conda.sh && \
    conda create -y -n dashboard python==3.10.14 && \
    conda activate dashboard && \
    pip3 install -r "$DASHBOARD_HOME/requirements.txt"

# Set up parser
ENV NGB_LOG_FOLDER=/opt/ngb/logs/
ENV NGB_STATS_FILE="/opt/ngb/data/stats/ngb-user-stats.parquet"
ENV NGS_LOG_SYNC_TIMESTAMP="/opt/ngb/data/stats/sync.txt"
ENV CRON_FILE="$DASHBOARD_HOME/parser.sh"
ENV NGB_DASHBOARD_URL_PATH="/dashboard/"

# Register a nightly (01:00) cron job that re-parses request logs into the
# stats file. Fixed: `apt-get update` must precede `apt-get install`, or the
# package index may be missing/stale in the image layer; the index lists are
# removed afterwards to keep the layer small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends cron && \
    rm -rf /var/lib/apt/lists/* && \
    echo "0 1 * * * export NGB_LOG_FOLDER="$NGB_LOG_FOLDER"; export NGB_STATS_FILE="$NGB_STATS_FILE"; export NGS_LOG_SYNC_TIMESTAMP="$NGS_LOG_SYNC_TIMESTAMP"; . $ANACONDA_HOME/etc/profile.d/conda.sh && conda activate dashboard && cd $DASHBOARD_HOME/parser && python3 log_parser.py >> $DASHBOARD_HOME/parser.log 2>&1\n" >> "$CRON_FILE" && \
    chmod 0644 "$CRON_FILE" && \
    crontab "$CRON_FILE"


196 changes: 196 additions & 0 deletions dashboard/dashboard/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import os

from dash import Dash, html, dash_table, dcc, callback, Output, Input
import pandas as pd
import plotly.express as px
import dash_bootstrap_components as dbc
from datetime import date, timedelta

# Shared style dicts used to toggle visibility of layout sections.
STYLE_HIDE = {'display': 'none'}
STYLE_SHOW = {'display': 'block'}
# Timeframe dropdown option labels (also used as the option values).
LAST_WEEK = 'Last week'
LAST_MONTH = 'Last month'
LAST_YEAR = 'Last year'
THIS_YEAR = 'This year'
CUSTOM_PERIOD = 'Custom period'

# Load the pre-parsed user activity dataset (produced by parser/log_parser.py).
# NGB_STATS_FILE must point at an existing parquet file; fail fast otherwise.
data_file = os.getenv('NGB_STATS_FILE')
if not data_file or not os.path.exists(data_file):
    raise ValueError('Missing user activity data')
df = pd.read_parquet(data_file)
df['date'] = pd.to_datetime(df['date'])  # ensure datetime dtype for range filtering
df = df.reset_index(drop=True)

external_stylesheets = [dbc.themes.BOOTSTRAP]
# NGB_DASHBOARD_URL_PATH lets the app be mounted under a sub-path
# (the Dockerfile sets it to /dashboard/).
app = Dash(__name__, external_stylesheets=external_stylesheets,
           url_base_pathname=os.getenv('NGB_DASHBOARD_URL_PATH', '/'),
           title="NGB Usage Report")

# Timeframe selector; choosing 'Custom period' defers to the date picker.
dropdown = dcc.Dropdown(
    id='timeframe_dropdown',
    multi=False,
    options=[
        {'label': LAST_WEEK, 'value': LAST_WEEK},
        {'label': LAST_MONTH, 'value': LAST_MONTH},
        {'label': LAST_YEAR, 'value': LAST_YEAR},
        {'label': THIS_YEAR, 'value': THIS_YEAR},
        {'label': CUSTOM_PERIOD, 'value': CUSTOM_PERIOD}
    ],
    value=THIS_YEAR,  # default selection on first load
    clearable=False,
    style={"width": "150px", "margin-right": "15px"}
)


# Concrete date range; filled in programmatically by the update_date callback
# for every dropdown choice except 'Custom period'.
date_picker = dcc.DatePickerRange(
    id='date-picker-range',
    min_date_allowed=date(2020, 1, 1),
    initial_visible_month=date.today(),
    persistence=True  # keep the user's chosen range across page reloads
)

# App layout
# App layout
app.layout = dbc.Container([
    dbc.Row([
        html.Div('NGB User Statistics', style={"font-weight": "bold", "font-size": "large", "margin-bottom": "10px"})
    ]),

    # Timeframe dropdown and date picker rendered side by side.
    html.Div([dropdown, date_picker], style={"display": "flex", "align-items": "center"}),

    # Headline totals; the update_output callback hides this table until a
    # complete date range is selected.
    html.Table(className='table',
               children=
               [
                   html.Tr([html.Td(id='unique-users-cell', style={'width': '200px'}),
                            html.Td(id='number-logins-cell', style={'width': '200px'}),
                            html.Td(id='active-users-cell', style={'width': '200px'})]),
               ], style=STYLE_SHOW, id='total-stat-table'),

    # Users-per-day line chart.
    dbc.Row([
        dbc.Col([
            dcc.Graph(figure={}, id='user-activity-chart')
        ]),
    ], style=STYLE_SHOW, id='chart-container'),

    # Sortable, CSV-exportable per-user login table.
    dbc.Row([
        dbc.Col([
            html.Label("Logins per user"),
            dash_table.DataTable(data=[], page_size=12, style_table={'overflowX': 'auto'},
                                 sort_action='native', id='user-table', export_format='csv')
        ]),
    ], style=STYLE_SHOW, id='user-table-container'),

], fluid=True)


def previous_week_end(input_date):
    """Return the Sunday that closed the week before *input_date*.

    For Monday-Saturday inputs this is the most recent past Sunday; for a
    Sunday input it is the Sunday one week earlier — the result is always
    strictly before *input_date*.
    """
    days_back = input_date.isoweekday() % 7
    if days_back == 0:
        # input is itself a Sunday -> step a full week back
        days_back = 7
    return input_date - timedelta(days=days_back)


def previous_week(input_date):
    """Return the (Monday, Sunday) bounds of the full week before *input_date*."""
    week_end = previous_week_end(input_date)
    week_start = week_end - timedelta(days=6)
    return week_start, week_end


def get_previous_month(input_date):
    """Return (first day, last day) of the calendar month before *input_date*'s month."""
    # Stepping one day back from the 1st of the current month lands on the
    # last day of the previous month, whatever its length.
    last_of_previous = input_date.replace(day=1) - timedelta(days=1)
    return last_of_previous.replace(day=1), last_of_previous


def get_previous_year(input_date):
    """Return (Jan 1, Dec 31) of the calendar year before *input_date*'s year.

    Fixed: the original first did ``input_date.replace(year=year - 1)`` on
    its own, which raises ValueError when *input_date* is Feb 29 of a leap
    year (Feb 29 does not exist in the previous year). Replacing year, month
    and day in a single call always yields a valid date.
    """
    year = input_date.year - 1
    return (input_date.replace(year=year, month=1, day=1),
            input_date.replace(year=year, month=12, day=31))


def get_this_year(input_date):
    """Return the (Jan 1, Dec 31) bounds of *input_date*'s own year."""
    first_day = input_date.replace(month=1, day=1)
    last_day = input_date.replace(month=12, day=31)
    return first_day, last_day


@callback(
    Output('date-picker-range', 'start_date'),
    Output('date-picker-range', 'end_date'),
    Input('timeframe_dropdown', 'value')
)
def update_date(value):
    """Translate the dropdown choice into ISO date strings for the picker.

    'Custom period' clears both bounds so the user can pick them manually;
    any unrecognized value yields empty strings.
    """
    if value == CUSTOM_PERIOD:
        return None, None
    range_builders = {
        LAST_WEEK: previous_week,
        LAST_MONTH: get_previous_month,
        LAST_YEAR: get_previous_year,
        THIS_YEAR: get_this_year,
    }
    builder = range_builders.get(value)
    if builder is None:
        return '', ''
    start, end = builder(date.today())
    return str(start), str(end)

@callback(
    Output('user-activity-chart', 'figure'),
    Output('user-table', 'data'),
    Output('chart-container', 'style'),
    Output('total-stat-table', 'style'),
    Output('user-table-container', 'style'),
    Output('unique-users-cell', 'children'),
    Output('number-logins-cell', 'children'),
    Output('active-users-cell', 'children'),
    Input('date-picker-range', 'start_date'),
    Input('date-picker-range', 'end_date'))
def update_output(start_date, end_date):
    """Rebuild the chart, headline totals and per-user table for the
    selected [start_date, end_date] range.

    Returns empty, hidden components while either bound is missing (e.g.
    'Custom period' selected but not yet filled in).
    """
    if not start_date or not end_date:
        return {}, [], STYLE_HIDE, STYLE_HIDE, STYLE_HIDE, '', '', ''
    start_date_object = pd.to_datetime(start_date)
    end_date_object = pd.to_datetime(end_date)
    data = df[(df['date'] >= start_date_object) & (df['date'] <= end_date_object)]

    # Daily totals for the line chart (one row per date).
    graph_data = data.groupby("date").agg(
        NumberOfUsers=pd.NamedAgg(column="user", aggfunc="count"),
        NumberOfLogins=pd.NamedAgg(column="logins", aggfunc="sum"),
        ActiveTime=pd.NamedAgg(column="duration", aggfunc="sum")
    )
    graph_data['date'] = graph_data.index
    fig = px.line(graph_data, x="date", y="NumberOfUsers", title='Users per day',
                  labels={"NumberOfUsers": "Users count", "date": ""},
                  template='plotly_white', markers=True)
    # Force integer ticks: the y axis counts users.
    fig.update_layout(
        yaxis=dict(
            tickmode='linear',
            tick0=0,
            dtick=1
        )
    )
    fig.update_xaxes(fixedrange=False)
    fig.update_traces(line_color='#000000')

    # One row per user for the exportable table.
    processed = data.groupby("user").agg(
        User=pd.NamedAgg(column="user", aggfunc="first"),
        Email=pd.NamedAgg(column="email", aggfunc="first"),
        LastLogin=pd.NamedAgg(column="date", aggfunc="max"),
        NumberOfLogins=pd.NamedAgg(column="logins", aggfunc="sum"),
        ActiveTime=pd.NamedAgg(column="duration", aggfunc="sum")
    )
    processed['LastLogin'] = pd.to_datetime(processed['LastLogin']).dt.date
    processed = processed.rename(columns={'LastLogin': 'Last Login Date',
                                          'NumberOfLogins': 'Number of Logins',
                                          'ActiveTime': 'Duration Used (min)'})

    unique_users = len(data['user'].unique())
    total_logins = data['logins'].sum()
    # "Active" users were seen within the last 8 weeks of the selected range.
    # Fixed: the mask must be built from the already-filtered `data`, not the
    # full `df` — a df-length boolean Series is misaligned with data's index.
    active_users_data = data[(data['date'] >= end_date_object - timedelta(weeks=8))]
    active_users = len(active_users_data['user'].unique())

    return fig, processed.to_dict('records'), \
        STYLE_SHOW, STYLE_SHOW, STYLE_SHOW, \
        'Unique Users: %d' % unique_users, \
        'Number of Logins: %d' % total_logins, \
        'Number of Active Users: %d' % active_users


# Run the app
if __name__ == '__main__':
    # Starts Dash's built-in development server with its default host/port;
    # in the container this is launched in the background by start.sh.
    app.run()
4 changes: 4 additions & 0 deletions dashboard/dashboard/assets/style.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/* Strip the default border and background from Dash's DatePickerRange input
   so it blends into the toolbar row next to the timeframe dropdown. */
.DateRangePickerInput{
border: 0px;
background-color: transparent;
}
132 changes: 132 additions & 0 deletions dashboard/parser/log_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import json
import os
import pandas as pd
import datetime

# Date format used both in log-file name suffixes and the sync-token file.
DATE_FORMAT = "%Y-%m-%d"
# Log files are named requests.log.YYYY-MM-DD; the suffix is the file's date.
LOG_PREFIX = 'requests.log.'
# Placeholder identity/session assigned to requests with no authorized user.
ANONYMOUS = 'Anonymous User'
DEFAULT_SESSION = 'default'


def parse_message(data, timestamp, skip_non_authorized):
    """Parse one 'After request [...]' log message into a record dict.

    :param data: message of the form ``After request [k=v;k=v;...]``.
    :param timestamp: log timestamp in ``%d/%m/%Y %H:%M:%S`` form.
    :param skip_non_authorized: when True, records without a ``user`` field
        are left without one (the caller drops them); when False they are
        attributed to the anonymous placeholder user/session.
    :return: dict with ``timestamp`` (datetime) and, when present,
        ``session`` and ``user``.
    """
    # Strip the 'After request [' prefix and the trailing ']'.
    trimmed = data[len('After request ['):-1]
    chunks = trimmed.split(';')
    result = {'timestamp': datetime.datetime.strptime(timestamp, "%d/%m/%Y %H:%M:%S")}
    for chunk in chunks:
        # Fixed: split with maxsplit=1 so values containing '=' (e.g. padded
        # base64 session tokens) are not truncated at the second '='.
        if chunk.startswith('session'):
            result['session'] = chunk.split('=', 1)[1]
        if chunk.startswith('user'):
            result['user'] = chunk.split('=', 1)[1]
    if 'user' not in result and not skip_non_authorized:
        result['user'] = ANONYMOUS
        result['session'] = DEFAULT_SESSION
    return result


def calculate_stats(entry):
    """Return the summed active time over all sessions in *entry*.

    Each session's active time is the span between its earliest and latest
    request timestamps; the result is a Timedelta totalled across sessions.
    """
    spans = entry.groupby('session')['timestamp'].agg(['min', 'max'])
    return (spans['max'] - spans['min']).sum()


def process_dataset(df):
    """Collapse raw request records into one stats row per user.

    Input columns: ``user`` (email), ``session``, ``timestamp``.
    Output (indexed by email): ``date`` (last seen, as date), ``logins``
    (distinct sessions), ``duration`` (total active minutes, 2 dp),
    ``email`` and ``user`` (email's local part).
    """
    def _total_session_time(rows):
        # Active time = sum over sessions of (last - first request timestamp).
        spans = rows.groupby('session')['timestamp'].agg(['min', 'max'])
        return (spans['max'] - spans['min']).sum()

    by_user = df.groupby("user")
    result = by_user.agg(
        date=pd.NamedAgg(column="timestamp", aggfunc="max"),
        logins=pd.NamedAgg(column="session", aggfunc=pd.Series.nunique),
    )
    result['duration'] = by_user.apply(_total_session_time)
    result['email'] = result.index
    result['user'] = result['email'].apply(lambda s: s.split('@')[0])
    result['date'] = pd.to_datetime(result['date']).dt.date
    result['duration'] = result['duration'].apply(lambda x: x.total_seconds() / 60.0).round(2)
    return result


def parse_logs(log_folder, output, last_sync, sync_token, skip_non_authorized):
    """Parse NGB request logs into per-user stats and append them to *output*.

    :param log_folder: directory containing ``requests.log.YYYY-MM-DD`` files.
    :param output: parquet file accumulating per-user statistics.
    :param last_sync: datetime of the newest already-processed log file, or
        None; files dated on or before it are skipped.
    :param sync_token: path of the file to record the newest processed date
        into, or a falsy value to skip recording.
    :param skip_non_authorized: when True, requests without an authorized
        user are dropped instead of attributed to the anonymous placeholder.
    """
    data = None
    last_sync_date = None
    for file in os.listdir(log_folder):
        if not file.startswith(LOG_PREFIX):
            continue
        file_date = datetime.datetime.strptime(file[len(LOG_PREFIX):], "%Y-%m-%d")
        if last_sync and file_date <= last_sync:
            continue  # already processed in a previous run
        print('Reading file ' + file)
        items = []
        with open(os.path.join(log_folder, file)) as log:
            for line in log:
                line = line.strip()
                if not line:
                    continue
                # Records may carry a trailing request payload that breaks
                # JSON parsing; truncate the record just before it.
                if 'payload' in line:
                    index = line.index('payload')
                    line = line[:index] + '"}'
                try:
                    parsed = json.loads(line)
                    entry = parse_message(parsed['debug_message'],
                                          parsed['debug_timestamp'],
                                          skip_non_authorized)
                except (ValueError, KeyError, IndexError):
                    # Fixed: was `except BaseException: pass`, which also
                    # swallowed KeyboardInterrupt/SystemExit. Malformed or
                    # incomplete records are simply skipped.
                    continue
                if 'user' in entry:
                    items.append(entry)
        if not items:
            continue
        chunk = process_dataset(pd.DataFrame.from_dict(items))
        data = chunk if data is None else pd.concat([data, chunk])
        if last_sync_date is None or file_date > last_sync_date:
            last_sync_date = file_date
    print("Finished reading request data from %s. Processed %d entries. Last synchronized date is %s."
          % (log_folder, 0 if data is None else data.size, last_sync_date))

    if os.path.exists(output):
        previous_data = pd.read_parquet(output)
        # pd.concat silently drops None members, so this is safe even when
        # no new data was parsed this run.
        result = pd.concat([previous_data, data])
    else:
        result = data

    if result is not None:
        print('Saving usage statistics to %s.' % output)
        result.to_parquet(output)

    # Persist the newest processed date so the next run can resume from it.
    if sync_token and last_sync_date:
        with open(sync_token, 'w') as token:
            token.write(last_sync_date.strftime(DATE_FORMAT) + "\n")


def read_last_sync(file_path):
    """Return the datetime stored in the sync-token file, or None.

    The token file holds a single ``%Y-%m-%d`` date written by parse_logs().
    Returns None when *file_path* is unset/missing or no line parses as a
    date.
    """
    if file_path is None or not os.path.exists(file_path):
        return None
    with open(file_path) as token:
        for line in token:
            if line.strip():
                try:
                    return datetime.datetime.strptime(line.strip(), DATE_FORMAT)
                except ValueError:
                    # Fixed: was a bare `except:`; only a failed date parse
                    # should be tolerated — keep scanning subsequent lines.
                    pass
    return None


if __name__ == '__main__':
    # All configuration comes from the environment (set in dashboard/Dockerfile).
    folder = os.getenv('NGB_LOG_FOLDER')
    output = os.getenv('NGB_STATS_FILE')
    skip_non_authorized = os.getenv('NGB_LOG_SKIP_NON_AUTH', 'false').upper() == 'TRUE'
    # NOTE(review): this variable uses an "NGS_" prefix unlike the other
    # "NGB_" ones — it matches the Dockerfile, but looks like a typo; confirm.
    last_sync = os.getenv('NGS_LOG_SYNC_TIMESTAMP', None)
    if not folder:
        raise ValueError('Log folder not specified')
    if not output:
        raise ValueError('Output folder not specified')
    if not os.path.exists(folder):
        raise ValueError('Provided folder %s does not exist' % folder)
    if not os.path.exists(os.path.dirname(output)):
        os.makedirs(os.path.dirname(output))
    # last_sync doubles as the sync-token path: its stored date is read here
    # and the newest processed date is written back by parse_logs().
    parse_logs(folder, output, read_last_sync(last_sync), last_sync, skip_non_authorized)
5 changes: 5 additions & 0 deletions dashboard/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
dash==2.17.1
numpy==1.24.4
pandas==2.0.3
dash_bootstrap_components==1.6.0
pyarrow==14.0.1
5 changes: 5 additions & 0 deletions dashboard/start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Container entry script: start the cron daemon (which runs the nightly log
# parser job) and launch the Dash web app in the background inside the
# "dashboard" conda environment.
# Fixed: added a shebang, quoted all variable expansions, and made the `cd`
# fail-fast so the app is never launched from the wrong directory.
cron
. "$ANACONDA_HOME/etc/profile.d/conda.sh"
conda activate dashboard
cd "$DASHBOARD_HOME/dashboard" || exit 1
nohup python3 app.py >> "$DASHBOARD_HOME/app.log" 2>&1 &