Skip to content

Commit

Permalink
polishing
Browse files Browse the repository at this point in the history
  • Loading branch information
greenkidneybean committed Jun 14, 2018
1 parent 76860f6 commit d938951
Show file tree
Hide file tree
Showing 8 changed files with 289 additions and 40 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
.DS_Store
testing.ipynb
6 changes: 6 additions & 0 deletions .ipynb_checkpoints/Untitled-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 1
}
33 changes: 28 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
# Circos-Py
## Visualize Antibody Clones as Circos Plots
## B-Cell Clones as Circos Plots

This script was made to convert spreadsheets of antibody clones into [Circos](http://circos.ca/) plots with different colors representing phenotypic characteristics of the antibody clones.
This script was made to illustrate the expansion of B-cell clones over multiple timepoints by converting a spreadsheet input to a [Circos](http://circos.ca/) plot. In this specific case, the orange and blue colors depict specific B-cell phenotypes.

![example](example.png)
![example_input_data](example.png)

With the circos-py repository installed and setup, the above plot can be generated by navigating to the `circos-py` directory at the command line and running the `make_circos_plots.py` script with the example input data as shown below.

```shell
cd ~/circos-py
python make_circos_plots.py example_input_data.csv
```

## Install Circos-Py

## Setup
### Add Circos Binary to Your Shell Path

### Changes in the Circos Defaults

Expand All @@ -15,8 +27,19 @@ I'm also delimiting all my data files (karyotype and links) with tabs instead of

`file_delim = \t`

## Input File Requirements
- Input file must be an Excel workbook (.xlsx)
- Each sample must have its own sheet
- The sheet name will be used as a label for all data generated from that sheet
- Each sheet requires the following 4 columns
- `timepoint` - identifies the timepoint from which the sequence was derived
- `clone` - used to assign the sequence to a clone group
- `seq_id` - unique identifier for the sequence
- `spec` - phenotypic or subpopulation information
## Output Circos Plots

**ToDo:**
- Add example data
- Create py_circos module of functions
- Add two more sample to example data
- create circos module with all functions
- Lock package dependencies
- Web GUI
Binary file added __pycache__/circos.cpython-36.pyc
Binary file not shown.
212 changes: 212 additions & 0 deletions circos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
# Functions:
# - circos_input_file
# - format_dataframe
# - make_top_karyotype
# - make_bands_karyotype
# - generate_circos_links

import argparse
import datetime
import glob
import numpy as np
import os
import pandas as pd
import shutil
import subprocess

def circos_input_file(file, spec_dict):
"""
Custom create pandas dataframe from .csv circos_input_file
Strip spaces and hyphens in column names
Remove empty rows
"""
output_file_name = spec_dict['output_file_name']
df = pd.read_csv(file)
# strip spaces and hyphens in column names
# this could cause a problem in the future
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('-','_')
# drop any empty rows
df = df.dropna(axis=0, how='all')
# add circos_time column for timepoint labels (e.g. "Day000")
df['circos_time'] = 'Day' + df.timepoint.str.split('-').str[1]
# count the number of sequences in the clone
df['total_clone_count'] = df.groupby('clone')['clone'].transform('count')
# count the number of timepoints the clone shows up in
df['timepoint_clone_count'] = df.groupby(['timepoint','clone'])['clone'].transform('count')
# sort the df by time, then by clone count by timepoint, then with ascending clone numbers
df = df.sort_values(['timepoint', 'timepoint_clone_count', 'clone'], ascending=True, na_position='first')
df.to_csv(f'sorted_timepoints_{output_file_name}.csv', sep=',', index=False, header=True)
df = df.reset_index(drop=True)
if 'spec_list' in spec_dict:
# downselect dataframe if looking for just h7 or stem sequences
df = df[df['spec'].isin(spec_dict['spec_list'])]
return df

def format_dataframe(dataframe, spec_dict):
"""
use this to format an existing dataframe
"""
df = dataframe
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('-','_')
# drop any empty rows
df = df.dropna(axis=0, how='all')
# add circos_time column for timepoint labels (e.g. "Day000")
df['circos_time'] = 'Day' + df.timepoint.str.split('-').str[1]
# count the number of sequences in the clone
df['total_clone_count'] = df.groupby('clone')['clone'].transform('count')
# count the number of timepoints the clone shows up in
df['timepoint_clone_count'] = df.groupby(['timepoint','clone'])['clone'].transform('count')
# sort the df by time, then by clone count by timepoint, then with ascending clone numbers
df = df.sort_values(['timepoint', 'timepoint_clone_count', 'clone'], ascending=True, na_position='first')
#df.to_csv(f'sorted_timepoints_{output_file_name}.csv', sep=',', index=False, header=True)
if 'spec_list' in spec_dict:
# downselect dataframe if looking for just h7 or stem sequences
df = df[df['spec'].isin(spec_dict['spec_list'])]
df.reset_index(drop=True, inplace=True)
return df

def make_top_karyotype(input_dataframe, spec_dict):
"""
make the top of the circos karyotype
"""
output_file_name = spec_dict['output_file_name']
karyotype = input_dataframe['circos_time'].value_counts(sort=False).sort_index().to_frame(name='END')
karyotype['ID'] = karyotype.index
karyotype.reset_index(drop=True, inplace=True)
karyotype['#chr'] = 'chr'
karyotype['-'] = '-'
karyotype['START'] = 0
karyotype['LABEL'] = 'd' + karyotype['ID'].str[3:].apply(int).apply(str) + ' (' + karyotype['END'].apply(str) + ')'
karyotype['COLOR'] = ['chr'+ str(i+1) for i in range(len(karyotype))]
cols = ['#chr', '-', 'ID', 'LABEL', 'START', 'END', 'COLOR']
karyotype = karyotype[cols]
karyotype.to_csv(f'my_karyotype.txt', sep='\t', index=False)
return karyotype

def make_bands_karyotype(input_dataframe, spec_dict):
output_file_name = spec_dict['output_file_name']
# create bands
#create band start and stop points
band_start = [val for i in input_dataframe.groupby('timepoint')['timepoint'].count().tolist() for val in range(i)]
band_stop = [x+1 for x in band_start]

# add them as a column
input_dataframe['#band'] = 'band'
input_dataframe['band_start'] = band_start
input_dataframe['band_stop'] = band_stop

input_dataframe['band_color'] = ''
a = {True:'gneg',False:'gpos25'}
b = True

for i in range(len(input_dataframe)):
if i == 0:
input_dataframe.loc[i,'band_color'] = a[b]
pass
# for matching above row
else:
above = input_dataframe.loc[i-1,'clone']
current = input_dataframe.loc[i,'clone']
if above == current:
input_dataframe.loc[i,'band_color'] = a[b]
else:
b = not b
input_dataframe.loc[i,'band_color'] = a[b]

# create separate bands df
bands = input_dataframe[['#band', 'circos_time', 'seq_id', 'seq_id', 'band_start', 'band_stop', 'band_color']].copy()

# save the bands file (tab delimited)
bands.to_csv(f'bands.txt', sep='\t', index=False, header=True)

# append to karyotype file
with open(f'my_karyotype.txt', 'a') as f:
bands.to_csv(f, sep='\t', index=False )

return bands

def hue_list(samples=20, color=None):
'''
Generate a list of hues evenly spaced based on the sample number
hue### goes from 0-360
'''

if color == None:
color = 'all'

color_list = {
'all':[0,360],
'red_yellow':[0,60],
'yellow_green':[60,140],
'green_blue':[140,255],
'blue_purple':[185,275],
'purple_pink':[275,310],
'blue':[185,235],
'red':[0,20],
'red_orange':[0,30]}

# check to see of the color is in the list

for key,item in color_list.items():
if key == color:
return ['hue'+str(x).zfill(3) for x in np.linspace(item[0], item[1], samples, dtype=int)]

# generate links
def generate_circos_links(input_dataframe, spec_dict):
output_file_name = spec_dict['output_file_name']

link_df = input_dataframe[(input_dataframe['spec'].isin(spec_dict['spec_list'])) & (input_dataframe['clone'].isnull() == False)]

clones = link_df.groupby(['circos_time','clone'], sort=False)['band_start'].min().reset_index()
clones['band_stop'] = link_df.groupby(['circos_time', 'clone'], sort=False)['band_stop'].max().reset_index()['band_stop']
clones = clones.sort_values(['clone','circos_time']).reset_index(drop=True)

# generate a links df
cols = clones.columns.tolist()
df_1 = pd.DataFrame(columns=cols)
df_2 = pd.DataFrame(columns=cols)

for i,row in clones.iterrows():
below = i + 1
# if statement checks for last row in the clones dataframe
if below > clones.index.max():
continue
else:
if row['clone'] == clones.iloc[below]['clone']:
#print("here's clone: ",row['clone'])
#print(row)
df_1.loc[i] = row
df_2.loc[i] = clones.iloc[below]
else:
continue

# change column titles in df_2 before concat into links
df_2.columns = ['circos_time_2', 'clone_2', 'band_start_2', 'band_stop_2']
# concat the df_1 and df_1 to create base for links document
links = pd.concat([df_1, df_2], axis=1)
links = links.sort_values(['circos_time','band_start']).reset_index(drop=True)
# band start/stop need to be converted back to integers
links[['band_start','band_stop','band_start_2','band_stop_2']] = links[['band_start','band_stop','band_start_2','band_stop_2']].astype(int)

# add the unique colors here
link_keys = links['clone'].unique().tolist()
link_hues = hue_list(len(link_keys), color=spec_dict['color']) # this is gonna be a problem
link_values = ['color='+i+'_a2' for i in link_hues]
link_color_dict = dict(zip(link_keys, link_values))
links['link_color'] = links['clone'].map(link_color_dict)
final_links = links[['circos_time', 'band_start', 'band_stop', 'circos_time_2', 'band_start_2','band_stop_2', 'link_color']].copy()
final_links=final_links.rename(columns={'circos_time':'#ciros_time'})

# write link files
final_links.to_csv(f'links_{output_file_name}.txt', sep='\t', index=False, header=True)

# this is dangerous because it will layer the links.txt with each iteration
if os.path.isfile('links.txt') is False:
final_links.to_csv('links.txt', sep='\t', index=False, header=True)
else:
with open('links.txt', 'a') as f:
final_links.to_csv(f, sep='\t', index=False)

return final_links
Binary file modified example.png
100755 → 100644
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added example_data_input.xlsx
Binary file not shown.
Loading

0 comments on commit d938951

Please sign in to comment.