Skip to content

itsDZhang/Stock-Market-Analysis

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

27 Commits
 
 
 
 
 
 

Repository files navigation

Stock Market Analysis

Using pandas, numpy, seaborn, and matplotlib to get stock information, visualize different aspects of it, and analyze the risk of a stock from previous performance history.

Info that would be visualized

  1. Change in price over time
  2. Daily return of the stock market average
  3. Moving average of the various stocks
  4. Correlation between different stocks' closing price
  5. Correlation between different stocks' daily returns
  6. Value that should be put at risk when investing a certain stock
  7. How to predict future stock behaviour
  8. Bootstrap Method
  9. Monte Carlo Method
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

#For reading stock data 

from pandas_datareader import data, wb
import pandas_datareader as pdr
#For time stamps 
from datetime import datetime 
#Division
from __future__ import division 
#Using Yahoo to grab stock information

#Ticker symbols of the technology stocks analysed throughout the notebook
tech_stock = ['GOOG', 'MSFT', 'AMZN', 'AAPL']

#The analysis window ends now and starts at midnight on the same calendar
#day one year earlier
end = datetime.now()
try:
    start = datetime(end.year - 1, end.month, end.day)
except ValueError:
    #Only reachable on 29 February: the previous year has no leap day,
    #so fall back to 28 February
    start = datetime(end.year - 1, 2, 28)
#Download each ticker's price history from Yahoo Finance for the start..end window

for stock in tech_stock:
    #Bind the result to a module-level variable named after the ticker
    #(GOOG, MSFT, AMZN, AAPL) so that later cells can refer to e.g. AAPL directly
    globals()[stock] = pdr.get_data_yahoo(stock, start,end)
AAPL.describe() #Statistics on Apple stocks 
Open High Low Close Adj Close Volume
count 251.000000 251.000000 251.000000 251.000000 251.000000 2.510000e+02
mean 141.341315 142.295060 140.435937 141.474103 140.677993 2.793747e+07
std 16.902716 16.923112 16.647013 16.736888 17.202225 1.190328e+07
min 106.570000 107.680000 104.080002 105.709999 104.410980 1.147590e+07
25% 130.944999 132.154998 130.784996 131.784996 130.165566 2.067555e+07
50% 144.449997 145.300003 143.449997 144.289993 143.610962 2.533170e+07
75% 154.799995 155.470001 153.805001 154.735001 154.431870 3.221495e+07
max 174.000000 174.259995 171.119995 172.500000 172.500000 1.119850e+08
AAPL.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2016-11-07 to 2017-11-03
Data columns (total 6 columns):
Open         251 non-null float64
High         251 non-null float64
Low          251 non-null float64
Close        251 non-null float64
Adj Close    251 non-null float64
Volume       251 non-null int64
dtypes: float64(5), int64(1)
memory usage: 13.7 KB
#Historical view on closing price
AAPL['Adj Close'].plot(legend=True, figsize=(10,4))
<matplotlib.axes._subplots.AxesSubplot at 0x1070d3a10>

png

#Total volume of stock being traded each day over the past year
AAPL['Volume'].plot(legend=True,figsize=(12,4))
<matplotlib.axes._subplots.AxesSubplot at 0x1a10af4090>

png

#Calculating moving averages of the adjusted close over several window lengths
#MA = Moving Average
ma_day = [5, 10, 15, 20]

#Remember the generated column names so the plot below always matches ma_day
ma_columns = []
for ma in ma_day:
    column_name = "MA for %s days" % ma
    #The first (ma - 1) rows are NaN because a full window is not yet available
    AAPL[column_name] = AAPL['Adj Close'].rolling(ma).mean()
    ma_columns.append(column_name)

#Overlay the adjusted close and every moving average on a single set of axes
AAPL[['Adj Close'] + ma_columns].plot(subplots=False, figsize=(10, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x1a119b6710>

png

Daily Return Analysis

  • Daily Changes of stocks
#Percent Change for each day 

AAPL['Daily Return'] = AAPL['Adj Close'].pct_change()
AAPL['Daily Return'].plot(figsize=(16,4),legend=True,linestyle='--',marker='o')
<matplotlib.axes._subplots.AxesSubplot at 0x1a1214ca10>

png

#Seaborn does not recognize null values, so I have to use dropna()
sns.distplot(AAPL['Daily Return'].dropna(),bins=100,color='purple')
<matplotlib.axes._subplots.AxesSubplot at 0x1a11dd53d0>

png

#Using a histogram plot
AAPL['Daily Return'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1a12630bd0>

png

# Grab all the closing prices for the tech stock list into one DataFrame
closingPrice_dataFrame = pdr.get_data_yahoo(['AAPL','MSFT','AMZN','GOOG'],start,end)['Adj Close']
closingPrice_dataFrame.head()
AAPL AMZN GOOG MSFT
Date
2017-11-03 172.500000 1111.599976 1032.479980 84.139999
2017-11-02 168.110001 1094.219971 1025.579956 84.050003
2017-11-01 166.889999 1103.680054 1025.500000 83.180000
2017-10-31 169.039993 1105.280029 1016.640015 83.180000
2017-10-30 166.720001 1110.849976 1017.109985 83.889999
#Daily percentage returns for every stock.
#The price frame shown above is ordered newest-first, and pct_change() compares
#each row with the row before it, so sort chronologically first; otherwise every
#"return" would be measured backwards in time (from a later day to an earlier one).
techReturns_dataFrame = closingPrice_dataFrame.sort_index().pct_change()
#Comparing Google to itself: a sanity check that should give a perfect straight line
#(keyword arguments are used because newer seaborn releases no longer accept
#x, y and data positionally)
sns.jointplot(x='GOOG', y='GOOG', data=techReturns_dataFrame, kind='scatter', color='cyan')
<seaborn.axisgrid.JointGrid at 0x1a12968410>

png

#Dealing with correlations now
#Comparing Google with Microsoft to see whether their daily returns are correlated
#(keyword arguments are used because newer seaborn releases no longer accept
#x, y and data positionally)
sns.jointplot(x='GOOG', y='MSFT', data=techReturns_dataFrame, kind='scatter')
<seaborn.axisgrid.JointGrid at 0x1a12c35f90>

png

From what is presented above, there seems to be some correlation between the daily returns of the two stocks.

#A sense of what's correlated and what is not
from IPython.display import SVG
SVG(url='http://upload.wikimedia.org/wikipedia/commons/d/d4/Correlation_examples2.svg')

svg

#To see the correlation for all the companies all at once
sns.pairplot(techReturns_dataFrame.dropna())
<seaborn.axisgrid.PairGrid at 0x1a13cfb150>

png

Above are pairwise comparisons of the daily returns of all the stocks (correlation).

#Full control of the figure: choose a different plot type for the upper triangle,
#the lower triangle and the diagonal of the pairwise grid of daily returns
fig = sns.PairGrid(techReturns_dataFrame.dropna())

#Upper triangle: scatter plot of each pair of stocks
fig.map_upper(plt.scatter,color='red')
#Lower triangle: kernel density estimate of each pair
#NOTE(review): 'cool_d' looks like a seaborn "_d" (dark) palette name - confirm
#that the installed seaborn/matplotlib versions still accept it as a cmap
fig.map_lower(sns.kdeplot,cmap='cool_d')
#Diagonal: histogram (20 bins) of each stock's own daily returns
fig.map_diag(plt.hist,bins=20)
<seaborn.axisgrid.PairGrid at 0x1a13d13890>

png

#Correlation of closing price: the same grid layout, built from the adjusted
#closing prices instead of the daily returns
fig = sns.PairGrid(closingPrice_dataFrame.dropna())

#Upper triangle: scatter plot of each pair of stocks
fig.map_upper(plt.scatter,color='red')
#Lower triangle: kernel density estimate of each pair
#NOTE(review): 'cool_d' looks like a seaborn "_d" (dark) palette name - confirm
#that the installed seaborn/matplotlib versions still accept it as a cmap
fig.map_lower(sns.kdeplot,cmap='cool_d')
#Diagonal: histogram (20 bins) of each stock's own closing prices
fig.map_diag(plt.hist,bins=20)
<seaborn.axisgrid.PairGrid at 0x1a13bb1610>

png

Risk Analysis

  • Gathering daily percentage returns by comparing the expected return with the standard deviation of the daily returns
#Daily returns with the incomplete rows removed
returns = techReturns_dataFrame.dropna()

#Expected return (mean) and risk (standard deviation) per stock, computed once
expected = returns.mean()
risk = returns.std()
plt.scatter(expected, risk, alpha=0.5, s=np.pi * 20)

#Setting limits: keep the original viewing window, but widen it whenever a point
#would fall outside, because the one-year download window moves with today's date
plt.ylim([min(0.005, risk.min() * 0.9), max(0.025, risk.max() * 1.1)])
plt.xlim([min(-0.003, expected.min() - 0.0005), max(0.002, expected.max() + 0.0005)])

#Set the plot axis titles
plt.xlabel('Expected Returns')
plt.ylabel('Risk')

#Labelling the scatterplots:  http://matplotlib.org/users/annotations_guide.html
#Each ticker name is drawn 50 points up and to the right of its marker and
#joined to it by a curved line
for label, x, y in zip(returns.columns, expected, risk):
    plt.annotate(
        label,
        xy=(x, y), xytext=(50, 50),
        textcoords='offset points', ha='right', va='bottom',
        arrowprops=dict(arrowstyle='-', connectionstyle='arc3,rad=-0.3'))

png

Value at Risk

Using bootstrap method

Calculate the empirical quantiles from a histogram of daily returns. Quantiles information http://en.wikipedia.org/wiki/Quantile

#Repeating the daily returns for histogram for Apple Stock
#.distplot = distribution plot
sns.distplot(AAPL['Daily Return'].dropna(),bins=100,color='purple')
<matplotlib.axes._subplots.AxesSubplot at 0x1a16bb95d0>

png

#0.05 empirical quantiles of daily returns 
returns['AAPL'].quantile(0.05)
-0.01729882171950793

This is very interesting to see. The 0.05 empirical quantile of the daily returns is -0.017. This means that, with 95% confidence, the worst daily loss will not exceed 1.7%. If I had $10,000 invested, my one-day VaR (Value at Risk) at 95% confidence would be about 0.017 × $10,000 = $170.

Value at Risk using the Monte Carlo Method

Brief Overview: Use the Monte Carlo Method to run many trials with random market conditions and calculate the losses/gains for each trial. Then aggregate the results of all the trials to establish the risk of a certain stock.

The stock market is assumed to follow a random walk (a Markov process) consistent with the weak form of the EMH (Efficient Market Hypothesis). The weak form of the EMH states that the next price movement is conditionally independent of past price movements, because the information in past prices has already been incorporated into the current price.

This means that the exact price cannot be predicted perfectly solely based on past stocks information

EMH: https://en.wikipedia.org/wiki/Efficient-market_hypothesis

Markov Process: https://en.wikipedia.org/wiki/Markov_chain (A stochastic process satisfies the Markov property if one can make predictions for the future of the process based solely on its present state just as well as one could knowing the process's full history)

Geometric Brownian Motion equation: (Markov Process)

ΔS/S = μΔt+σϵ√Δt

In this equation S = stock price, μ = expected return, σ = standard deviation of returns, t = time, and ϵ = a random variable drawn from a standard normal distribution

Therefore, multiplying both sides by the stock price, the equation becomes:

ΔS=S(μΔt+σϵ√Δt)

μΔt is known as the "drift", where the average daily return is multiplied by the change in time. σϵ√Δt is known as the "shock", which pushes the stock price either up or down. By applying the drift and the shock thousands of times, we can simulate where the stock price might end up. Techniques were summarized from here: http://www.investopedia.com/articles/07/montecarlo.asp

#Number of time steps (days) to simulate
days = 365

#Size of one time step. Written with a float literal so it stays a fraction
#even under Python 2 integer division, and derived from `days` so the two
#values cannot drift apart.
#NOTE(review): mu and sigma below are statistics of *daily* returns, so with
#dt = 1/365 each step applies only 1/365 of a daily move -- confirm whether
#dt = 1 (one day per step) was intended.
dt = 1.0 / days

#Daily returns of Google only
goog_returns = returns['GOOG']

#Drift: the mean daily return
mu = goog_returns.mean()

#Volatility: the standard deviation of the daily returns
sigma = goog_returns.std()
'''
The following function takes in starting stock prices, 
days of simulations, mu, and sigma. 
It returns a simulated price array 
'''

def stock_monte_carlo(start_price, days, mu, sigma, time_step=None):
    """Simulate one price path with a discretised geometric Brownian motion.

    Each step applies  dS = S * (mu * dt + sigma * epsilon * sqrt(dt)),
    where epsilon is a fresh standard-normal draw.

    Parameters
    ----------
    start_price : float
        Price on day 0 (stored in element 0 of the result).
    days : int
        Length of the returned path, including day 0; must be at least 1.
    mu : float
        Expected return per unit of time (the drift rate).
    sigma : float
        Standard deviation of returns per unit of time (the volatility).
    time_step : float, optional
        Size of one step. Defaults to the module-level ``dt`` so existing
        four-argument calls keep working.

    Returns
    -------
    numpy.ndarray
        Array of ``days`` simulated prices.
    """
    step = dt if time_step is None else time_step

    # The shock is pure noise with zero mean; the drift is added separately
    # below. (Centring the noise on mu * dt as well would count the drift
    # twice.)
    shock = np.random.normal(loc=0.0, scale=sigma * np.sqrt(step), size=days - 1)
    drift = mu * step

    # price[x] = price[x-1] * (1 + drift + shock[x]), so the whole path is a
    # cumulative product of the per-step growth factors. Using days - 1 steps
    # (rather than a fixed 364) fills the array for any path length.
    price = np.empty(days)
    price[0] = start_price
    price[1:] = start_price * np.cumprod(1.0 + drift + shock)
    return price
GOOG.head()
Open High Low Close Adj Close Volume
Date
2016-11-07 774.500000 785.190002 772.549988 782.520020 782.520020 1585100
2016-11-08 783.400024 795.632996 780.190002 790.510010 790.510010 1350800
2016-11-09 779.940002 791.226990 771.669983 785.309998 785.309998 2607100
2016-11-10 791.169983 791.169983 752.179993 762.559998 762.559998 4745200
2016-11-11 756.539978 760.780029 750.380005 754.020020 754.020020 2431800
#Starting price: Google's opening price on the first day of the window
#(the 2016-11-07 row of GOOG.head() above)
start_price = 774.50

#Draw 365 independent simulated price paths on the same axes.
#range() is used instead of xrange() so the loop runs on both Python 2 and 3.
for i in range(365):
    plt.plot(stock_monte_carlo(start_price, days, mu, sigma))

plt.xlabel("Days")
plt.ylabel("Price")
plt.title("Monte Carlo Analysis for Google")
Text(0.5,1,u'Monte Carlo Analysis for Google')

png

#Summarise many simulated final prices in a histogram for better visualization

#Number of independent simulations to run
runs = 10000
simulations = np.zeros(runs)
np.set_printoptions(threshold = 4) #Or else the output would be far too long to read
#range() is used instead of xrange() so the loop runs on both Python 2 and 3
for i in range(runs):
    #Keep only the last element of each path, i.e. the simulated final price
    simulations[i] = stock_monte_carlo(start_price, days, mu, sigma)[-1]

# q is the 1% empirical quantile of the final prices: 99% of the simulated
# outcomes end above this value
q = np.percentile(simulations, 1)

# Plotting the distribution of the end prices
plt.hist(simulations, bins=200)

# Using plt.figtext to fill in some additional information onto the plot

# Starting Price
plt.figtext(0.6, 0.8, s="Start price: $%.2f" % start_price)
# Mean ending price
plt.figtext(0.6, 0.7, "Mean final price: $%.2f" % simulations.mean())

# Value at Risk at 99% confidence: the gap between the start price and the
# 1% quantile of the final prices
plt.figtext(0.7, 0.6, "VaR(0.99): $%.2f" % (start_price - q,))

# Display 1% quantile
plt.figtext(0.15, 0.6, "q(0.99): $%.2f" % q)

# Plot a line at the 1% quantile result
plt.axvline(x=q, linewidth=4, color='r')

# Title
plt.title("Final price distribution for Google Stock after %s days" % days, weight='bold')

png

From above, the Value at Risk seems to be $19.50 for every $774.50 invested.
If a user was putting $774.50 as an initial investment, it means he's putting $19.50 at risk.

About

Analyzing stocks using statistics | Bootstrap | Monte Carlo Method

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages