This notebook demonstrates:
Data:
Feature / cell matrix HDF5 (filtered)
(.h5 format) and Clustering analysis
(.tar format) files. Note:
# import modules, define some functions for loading, saving and processing a gene-barcode matrix
%matplotlib inline
import collections
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp_sparse
import h5py
np.random.seed(0)
FeatureBCMatrix = collections.namedtuple('FeatureBCMatrix', ['feature_ids', 'feature_names', 'barcodes', 'matrix'])
def get_matrix_from_h5(filename):
with h5py.File(filename) as f:
if u'version' in f.attrs:
if f.attrs['version'] > 2:
raise ValueError('Matrix HDF5 file format version (%d) is an newer version that is not supported by this function.' % version)
else:
raise ValueError('Matrix HDF5 file format version (%d) is an older version that is not supported by this function.' % version)
feature_ids = [x.decode('ascii', 'ignore') for x in f['matrix']['features']['id']]
feature_names = [x.decode('ascii', 'ignore') for x in f['matrix']['features']['name']]
barcodes = list(f['matrix']['barcodes'][:])
matrix = sp_sparse.csc_matrix((f['matrix']['data'], f['matrix']['indices'], f['matrix']['indptr']), shape=f['matrix']['shape'])
return FeatureBCMatrix(feature_ids, feature_names, barcodes, matrix)
def get_expression(fbm, gene_name):
try:
gene_index = feature_bc_matrix.feature_names.index(gene_name)
except ValueError:
raise Exception("%s was not found in list of gene names." % gene_name)
return fbm.matrix[gene_index, :].toarray().squeeze()
filtered_matrix_h5 = "neuron_10k_v3_filtered_feature_bc_matrix.h5"
%time feature_bc_matrix = get_matrix_from_h5(filtered_matrix_h5)
# untar the secondary analysis
# alternatively, copy these to the command line (omitting the initial '!' character)
!tar -xzf neuron_10k_v3_analysis.tar.gz
# load tSNE and graph clustering
tsne = pd.read_csv("analysis/tsne/2_components/projection.csv")
clusters = pd.read_csv("analysis/clustering/graphclust/clusters.csv")
# calculate UMIs and genes per cell
umis_per_cell = np.asarray(feature_bc_matrix.matrix.sum(axis=0)).squeeze()
genes_per_cell = np.asarray((feature_bc_matrix.matrix > 0).sum(axis=0)).squeeze()
# plot UMIs per cell
plt.hist(np.log10(umis_per_cell), bins=50)
plt.xlabel('UMIS per cell (log10)')
plt.ylabel('Frequency')
plt.title('UMI Distribution')
plt.show()
# plot genes per cell
plt.hist(genes_per_cell, bins=50)
plt.xlabel('Genes per Cell')
plt.ylabel('Frequency')
plt.title('Gene Distribution')
plt.show()
# plot clusters in TSNE space
plt.figure(figsize=(8, 8))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=clusters['Cluster'], linewidths=0, s=5)
plt.title('Graph-Based Clustering')
plt.show()
# plot three markers: Stmn2 (pan-neuronal), Tbr1 (excitatory), Olig1 (oligodendrocytes)
marker_genes = ['Stmn2', 'Tbr1', 'Olig1']
f, axes = plt.subplots(1, len(marker_genes), figsize=(5*len(marker_genes), 4))
for gene, axis in zip(marker_genes, axes):
expr = get_expression(feature_bc_matrix, gene)
axis.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=expr, s=5, linewidths=0, cmap=plt.cm.Reds)
axis.set_title(gene)
plt.show()