From 9d2a4fe71df616114fc5eb3881b976b632024b58 Mon Sep 17 00:00:00 2001
From: Ashkan Farhangi
Date: Wed, 18 May 2022 13:14:39 -0400
Subject: [PATCH] Updated README.md

---
 .env                                       |  12 -
 .vscode/configurationCache.log             |   1 +
 .vscode/dryrun.log                         |   4 +
 .vscode/settings.json                      |   3 +
 .vscode/targets.log                        |   4 +
 Makefile                                   | 144 ---------
 README.md                                  |  35 +--
 __init__.py                                |   4 +
 data/__init__.py                           |   0
 data/__pycache__/__init__.cpython-38.pyc   | Bin 0 -> 195 bytes
 data/__pycache__/dataloader.cpython-38.pyc | Bin 0 -> 653 bytes
 data/dataloader.py                         |  59 ++++
 main.py                                    |  20 ++
 requirements.txt                           |  27 +-
 setup.py                                   |  10 -
 src/__init__.py                            |   0
 src/__pycache__/metric.cpython-38.pyc      | Bin 0 -> 212 bytes
 src/__pycache__/model.cpython-38.pyc       | Bin 0 -> 210 bytes
 src/__pycache__/plot.cpython-38.pyc        | Bin 0 -> 208 bytes
 src/__pycache__/trainer.cpython-38.pyc     | Bin 0 -> 214 bytes
 src/__pycache__/utils.cpython-38.pyc       | Bin 0 -> 1291 bytes
 src/metric.py                              |   7 +
 src/model.py                               | 123 ++++++++
 src/plot.py                                |  14 +
 src/trainer.py                             | 339 +++++++++++++++++++++
 src/utils.py                               |  31 ++
 test_environment.py                        |  25 --
 tox.ini                                    |   3 -
 28 files changed, 635 insertions(+), 230 deletions(-)
 delete mode 100644 .env
 create mode 100644 .vscode/configurationCache.log
 create mode 100644 .vscode/dryrun.log
 create mode 100644 .vscode/settings.json
 create mode 100644 .vscode/targets.log
 delete mode 100644 Makefile
 create mode 100644 __init__.py
 create mode 100644 data/__init__.py
 create mode 100644 data/__pycache__/__init__.cpython-38.pyc
 create mode 100644 data/__pycache__/dataloader.cpython-38.pyc
 create mode 100644 data/dataloader.py
 create mode 100644 main.py
 delete mode 100644 setup.py
 create mode 100644 src/__init__.py
 create mode 100644 src/__pycache__/metric.cpython-38.pyc
 create mode 100644 src/__pycache__/model.cpython-38.pyc
 create mode 100644 src/__pycache__/plot.cpython-38.pyc
 create mode 100644 src/__pycache__/trainer.cpython-38.pyc
 create mode 100644 src/__pycache__/utils.cpython-38.pyc
 create mode 100644 src/metric.py
 create mode 100644 src/model.py
 create mode 100644 src/plot.py
 create mode 100644 src/trainer.py
 create mode 100644 src/utils.py
 delete mode 100644 test_environment.py
 delete mode 100644 tox.ini

diff --git a/.env b/.env
deleted file mode 100644
index 1d08ff3..0000000
--- a/.env
+++ /dev/null
@@ -1,12 +0,0 @@
-# Environment variables go here, can be read by `python-dotenv` package:
-#
-#   `src/script.py`
-#   ----------------------------------------------------------------
-#    import dotenv
-#
-#    project_dir = os.path.join(os.path.dirname(__file__), os.pardir)
-#    dotenv_path = os.path.join(project_dir, '.env')
-#    dotenv.load_dotenv(dotenv_path)
-#   ----------------------------------------------------------------
-#
-# DO NOT ADD THIS FILE TO VERSION CONTROL!
diff --git a/.vscode/configurationCache.log b/.vscode/configurationCache.log
new file mode 100644
index 0000000..bab9054
--- /dev/null
+++ b/.vscode/configurationCache.log
@@ -0,0 +1 @@
+{"buildTargets":[],"launchTargets":[],"customConfigurationProvider":{"workspaceBrowse":{"browsePath":[],"compilerArgs":[]},"fileIndex":[]}}
\ No newline at end of file
diff --git a/.vscode/dryrun.log b/.vscode/dryrun.log
new file mode 100644
index 0000000..ebf85ed
--- /dev/null
+++ b/.vscode/dryrun.log
@@ -0,0 +1,4 @@
+make.exe --dry-run --always-make --keep-going --print-directory
+'make.exe' is not recognized as an internal or external command,
+operable program or batch file.
+
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..65e1ec0
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "makefile.extensionOutputFolder": "./.vscode"
+}
\ No newline at end of file
diff --git a/.vscode/targets.log b/.vscode/targets.log
new file mode 100644
index 0000000..04f2128
--- /dev/null
+++ b/.vscode/targets.log
@@ -0,0 +1,4 @@
+make.exe all --print-data-base --no-builtin-variables --no-builtin-rules --question
+'make.exe' is not recognized as an internal or external command,
+operable program or batch file.
+
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 3400065..0000000
--- a/Makefile
+++ /dev/null
@@ -1,144 +0,0 @@
-.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3
-
-#################################################################################
-# GLOBALS                                                                       #
-#################################################################################
-
-PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
-BUCKET = no
-PROFILE = no
-PROJECT_NAME = ashfarhangi.github.com
-PYTHON_INTERPRETER = python3
-
-ifeq (,$(shell which conda))
-HAS_CONDA=False
-else
-HAS_CONDA=True
-endif
-
-#################################################################################
-# COMMANDS                                                                      #
-#################################################################################
-
-## Install Python Dependencies
-requirements: test_environment
-	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
-	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt
-
-## Make Dataset
-data: requirements
-	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed
-
-## Delete all compiled Python files
-clean:
-	find . -type f -name "*.py[co]" -delete
-	find . -type d -name "__pycache__" -delete
-
-## Lint using flake8
-lint:
-	flake8 src
-
-## Upload Data to S3
-sync_data_to_s3:
-ifeq (default,$(PROFILE))
-	aws s3 sync data/ s3://$(BUCKET)/data/
-else
-	aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
-endif
-
-## Download Data from S3
-sync_data_from_s3:
-ifeq (default,$(PROFILE))
-	aws s3 sync s3://$(BUCKET)/data/ data/
-else
-	aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
-endif
-
-## Set up python interpreter environment
-create_environment:
-ifeq (True,$(HAS_CONDA))
-	@echo ">>> Detected conda, creating conda environment."
-ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
-	conda create --name $(PROJECT_NAME) python=3
-else
-	conda create --name $(PROJECT_NAME) python=2.7
-endif
-	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
-else
-	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
-	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
-	export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
-	@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
-	@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
-endif
-
-## Test python environment is setup correctly
-test_environment:
-	$(PYTHON_INTERPRETER) test_environment.py
-
-#################################################################################
-# PROJECT RULES                                                                 #
-#################################################################################
-
-
-
-#################################################################################
-# Self Documenting Commands                                                     #
-#################################################################################
-
-.DEFAULT_GOAL := help
-
-# Inspired by
-# sed script explained:
-# /^##/:
-# 	* save line in hold space
-# 	* purge line
-# 	* Loop:
-# 		* append newline + line to hold space
-# 		* go to next line
-# 	* if line starts with doc comment, strip comment character off and loop
-# * remove target prerequisites
-# * append hold space (+ newline) to line
-# * replace newline plus comments by `---`
-# * print line
-# Separate expressions are necessary because labels cannot be delimited by
-# semicolon; see
-.PHONY: help
-help:
-	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
-	@echo
-	@sed -n -e "/^## / { \
-		h; \
-		s/.*//; \
-		:doc" \
-		-e "H; \
-		n; \
-		s/^## //; \
-		t doc" \
-		-e "s/:.*//; \
-		G; \
-		s/\\n## /---/; \
-		s/\\n/ /g; \
-		p; \
-	}" ${MAKEFILE_LIST} \
-	| LC_ALL='C' sort --ignore-case \
-	| awk -F '---' \
-		-v ncol=$$(tput cols) \
-		-v indent=19 \
-		-v col_on="$$(tput setaf 6)" \
-		-v col_off="$$(tput sgr0)" \
-	'{ \
-		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
-		n = split($$2, words, " "); \
-		line_length = ncol - indent; \
-		for (i = 1; i <= n; i++) { \
-			line_length -= length(words[i]) + 1; \
-			if (line_length <= 0) { \
-				line_length = ncol - indent - length(words[i]) - 1; \
-				printf "\n%*s ", -indent, " "; \
-			} \
-			printf "%s ", words[i]; \
-		} \
-		printf "\n"; \
-	}' \
-	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
diff --git a/README.md b/README.md
index b985c12..166e0a4 100644
--- a/README.md
+++ b/README.md
@@ -15,19 +15,22 @@ arXiv:
 ![](https://github.com/0415070/Protoformer/blob/main/visualization/arvix100.png)
 ### Installation
-1. You can download a copy of all the files in this repository by cloning the following repo:
+• You can download a copy of all the files in this repository by cloning the repo:
 
-   ```
+   ```bash
    git clone https://github.com/ashfarhangi/Protoformer.git
    ```
-2. Install requirement packages
+• Install the required packages
 
-   ```
+   ```bash
    pip install -r requirements.txt
    ```
-3. Enter your Twitter API keys in:
+• Run model.py
+
+(Optional)
+• Enter your Twitter API keys:
 
 To use the data properly, you need your own official Twitter API keys. Please replace the API_KEY with your own, as shown below:
 
@@ -35,23 +38,13 @@ const API_KEY = 'ENTER YOUR API';
 ```
 
-4. Run model.py after the dataset has been gathered
-### Prerequisites
-You'll need a working Python environment to run the code.
-The recommended way to set up your environment is through the
-[Anaconda Python distribution](https://www.anaconda.com/download/) which
-provides the `conda` package manager.
-Anaconda can be installed in your user directory and does not interfere with
-the system Python installation.
-The required dependencies are specified in the file `environment.yml`.
-We used `conda` virtual environments to manage the project dependencies in
-isolation.
-Thus, you can install our dependencies without causing conflicts with your
-setup (even with different Python versions).
-Run the following command in the repository folder (where `environment.yml`
-is located) to create a separate environment and install all required
-dependencies in it:
-
-    conda env create
+### Prerequisites
+```
+Python
+git
+pip
+```
 
 ## License
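A note on the README's credential snippet: it is JavaScript-flavored, and the patch never names a Python Twitter client. A hypothetical sketch of wiring the same keys into tweepy (a common choice, assumed here, along with all four credential names):

```python
# Hypothetical sketch, not part of this patch: one way to use the Twitter
# API keys the README asks for, via the tweepy client library.
import tweepy

API_KEY = 'ENTER YOUR API'              # replace with your own
API_SECRET = 'ENTER YOUR API SECRET'    # replace with your own
ACCESS_TOKEN = 'ENTER YOUR ACCESS TOKEN'
ACCESS_SECRET = 'ENTER YOUR ACCESS SECRET'

auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
api = tweepy.API(auth)

# tweepy >= 4 names this search_tweets; tweepy 3.x exposes it as api.search.
tweets = api.search_tweets(q='some query', count=100)
```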
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..fd40910
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,4 @@
+
+
+
+
diff --git a/data/__init__.py b/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/data/__pycache__/__init__.cpython-38.pyc b/data/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b459afbf15563419e939d83c51649526825bfcae
Binary files /dev/null and b/data/__pycache__/__init__.cpython-38.pyc differ
diff --git a/data/__pycache__/dataloader.cpython-38.pyc b/data/__pycache__/dataloader.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7eac27cffcba94f87d3faaf881a87880b10ec205
Binary files /dev/null and b/data/__pycache__/dataloader.cpython-38.pyc differ
diff --git a/data/dataloader.py b/data/dataloader.py
new file mode 100644
index 0000000..ca03126
--- /dev/null
+++ b/data/dataloader.py
@@ -0,0 +1,59 @@
+import warnings
+warnings.filterwarnings('ignore')
+
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import matplotlib.pylab as pylab
+import torch
+from torch import nn, optim
+from torch.utils.data import Dataset, DataLoader
+import torch.nn.functional as F
+import sklearn
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder  # used below; missing from the original file
+from sklearn.metrics import confusion_matrix, classification_report
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.decomposition import PCA
+from scipy.stats import energy_distance
+from collections import defaultdict
+from textwrap import wrap
+from joblib import load, dump
+import pickle
+from tqdm import tqdm
+import transformers
+import datetime
+from fastdist import fastdist
+
+df_imdb = pd.read_csv('.//data//IMDB.csv')
+# df_imdb = df_imdb.sample(2000)
+df_imdb.reset_index(drop=True, inplace=True)
+sns.countplot(df_imdb.sentiment)
+plt.ylabel('Samples')
+plt.xlabel('IMDB Movie Sentiments')
+plt.show()
+# sns.countplot(df_embeddings.predicted_raw_difference)
+df = df_imdb
+df_profile = df_imdb
+df_profile.columns = ['number', 'doc', 'labels_original']
+df_profile['labels'] = df_profile['labels_original']
+
+le = LabelEncoder()
+df_profile['labels'] = le.fit_transform(df_profile['labels'])
+
+# X = df_profile.review
+X = df_profile.doc
+y = df_profile.labels
+# z = df_profile.user_name
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=47)
+print('number of training samples:', len(X_train))
+print('number of test samples:', len(X_test))
+train_df = pd.DataFrame({'doc': X_train, 'labels': y_train})
+test_df = pd.DataFrame({'doc': X_test, 'labels': y_test})
+train_df.reset_index(drop=True, inplace=True)
+test_df.reset_index(drop=True, inplace=True)
\ No newline at end of file
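dataloader.py imports torch's Dataset/DataLoader but stops at pandas frames. A hedged sketch of how train_df might be wrapped for batching; the tokenizer choice, field names beyond doc/labels, and max_len are assumptions, not part of the patch:

```python
# Hypothetical sketch, not part of this patch.
import torch
from torch.utils.data import Dataset, DataLoader

class DocDataset(Dataset):
    """Wraps a (doc, labels) DataFrame for a transformer trainer."""
    def __init__(self, frame, tokenizer, max_len=128):
        self.docs = frame['doc'].tolist()
        self.labels = frame['labels'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.docs)

    def __getitem__(self, idx):
        enc = self.tokenizer(self.docs[idx], truncation=True,
                             padding='max_length', max_length=self.max_len,
                             return_tensors='pt')
        return {'input_ids': enc['input_ids'].squeeze(0),
                'attention_mask': enc['attention_mask'].squeeze(0),
                'labels': torch.tensor(self.labels[idx])}

# e.g. loader = DataLoader(DocDataset(train_df, tokenizer), batch_size=16, shuffle=True)
```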
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..704ed79
--- /dev/null
+++ b/main.py
@@ -0,0 +1,20 @@
+# =============================================================================
+# Main file
+# =============================================================================
+from data import dataloader
+from src import metric, plot, trainer, utils
+# Protoformer is assumed to be defined in src/model.py; the original imported
+# the module and then shadowed it with the local variable below.
+from src.model import Protoformer
+
+
+def run():
+    """Builds the model, loads the data, then trains and evaluates."""
+    model = Protoformer('twitter-uni')
+    # DistilBERT (twitter-uni), BERT (imdb), RoBERTa (arxiv-10)
+    model.load_data('twitter-uni')
+    # twitter-uni, imdb, arxiv-10
+    model.build()
+    model.train()
+    model.evaluate()
+
+
+if __name__ == '__main__':
+    run()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 9f70b0a..347b9b3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +1,13 @@
-# local package
--e .
-
-# external requirements
-click
-Sphinx
-coverage
-awscli
-flake8
-python-dotenv>=0.5.1
-shap==0.35.0
-shap
+numpy
+pandas
+matplotlib
 transformers
-sentence-transformers
-torch==1.8.1
-
+torch
+fastdist
+scikit-learn
+scipy
+seaborn
+joblib
+tqdm
+pkbar
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index fc3d7b9..0000000
--- a/setup.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from setuptools import find_packages, setup
-
-setup(
-    name='src',
-    packages=find_packages(),
-    version='0.1.0',
-    description='Description',
-    author='Ashkan Farhangi ',
-    license='BSD-3',
-)
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/__pycache__/metric.cpython-38.pyc b/src/__pycache__/metric.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e6b90021e416dcd9a65d62b24d6d778d4af8fe22
Binary files /dev/null and b/src/__pycache__/metric.cpython-38.pyc differ
diff --git a/src/__pycache__/model.cpython-38.pyc b/src/__pycache__/model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..850a19930797b6a2f4a504e20511fd5a59b47586
Binary files /dev/null and b/src/__pycache__/model.cpython-38.pyc differ
diff --git a/src/__pycache__/plot.cpython-38.pyc b/src/__pycache__/plot.cpython-38.pyc
new file mode 100644
Binary files /dev/null and b/src/__pycache__/plot.cpython-38.pyc differ
diff --git a/src/__pycache__/trainer.cpython-38.pyc b/src/__pycache__/trainer.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a5bca50c6455d6418c1751ad3a15c608d725570
Binary files /dev/null and b/src/__pycache__/trainer.cpython-38.pyc differ
diff --git a/src/__pycache__/utils.cpython-38.pyc b/src/__pycache__/utils.cpython-38.pyc
new file mode 100644
Binary files /dev/null and b/src/__pycache__/utils.cpython-38.pyc differ
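The patch's file list records src/metric.py, src/model.py and src/plot.py being created, but their contents did not survive in readable form here; only a fragment of src/trainer.py follows. A minimal skeleton consistent with how main.py drives the model; every detail below is assumed rather than taken from the patch:

```python
# Hypothetical skeleton, not the actual src/model.py from this patch.
class Protoformer:
    def __init__(self, dataset_name):
        self.dataset_name = dataset_name  # 'twitter-uni', 'imdb', or 'arxiv-10'

    def load_data(self, dataset_name):
        """Load the train/test splits prepared by data/dataloader.py."""
        ...

    def build(self):
        """Instantiate the backbone (e.g. DistilBERT for twitter-uni)."""
        ...

    def train(self):
        ...

    def evaluate(self):
        ...
```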
znn~gee_w88f>_HJJR{E2hUMBgJl>&fK?>n0&%D|=HY7(64T1NR4O+d9jieWsW>T+S%Bj;PFs7B7fJ$JTwmhl%~($_h#z?m<1vi}C%k zsnGcnbIqZZ6Ge$FYdM7hMw|=n1fQq*(xbAql2FfugQn9aduwXw(bRE)q$1M7abZ)6 zB{F_IP1D=Xt!zrAi$Y~Sb0VPq4=%%JWqeGG;_A?rn+q-+GiN$P_E*8s2-|CrN==!V zSteNJ%tZc@{@+%Hwa%=G{rwxbl$}hA^mXW!}E9zERXuZ>O z%-eLmgPz~s%6W!Cm`%UAi>+36p^aq5mNDkaa6}j+|0$Uu^Rk&QQNYFjMV97-n`3U7 z>3e@$XJ*PWEpzOxuaHT+tz$EuFH`KLvu1#d8CLY`$1f%?f0?~``gDRnFHr^Y^pT1OVV)zjo?T&s@ z{P3-_hz=Ao218no^kVdcDNJ6Bo(XL@CDR|ddtS_^uZ(tjRgkGQ*>p7oI92iY*s4OX hM}8LUJ}z+>hh6*= 0: + df_embeddings_copy.loc[j,'larget_logit'] = df_embeddings_copy.loc[j, 'predict_c_0'] + if df_embeddings_copy.loc[j, 'predict_c_1'] >= 0: + df_embeddings_copy.loc[j,'larget_logit'] = df_embeddings_copy.loc[j, 'predict_c_1'] + + +fig, ax = plt.subplots() +fig.canvas.draw() + +ax = sns.distplot([df_embeddings_copy[df_embeddings_copy.wrong == 0].larget_logit_minmax] + ,label = 'Correct',color='black',kde=True,bins=16) +ax = sns.distplot([df_embeddings_copy[df_embeddings_copy.wrong == 1].larget_logit_minmax] + ,label = 'Wrong', + color='#FF1B1C',bins=8,kde=True) +plt.xlabel('Highest Logit') +labels = [item.get_text() for item in ax.get_yticklabels()] +ax.set_yticklabels(labels) + +plt.legend(loc=1) +plt.show() + + +# https://github.com/talboger/fastdist +def distance_matrix(df_embeddings): + dim = len(df_embeddings) # - 1500 + s_ij = np.zeros([dim,dim]) + e_ij = np.zeros([dim,dim]) + l1norm_ij = np.zeros([dim,dim]) + l2norm_ij = np.zeros([dim,dim]) + l3norm_ij = np.zeros([dim,dim]) + for j in tqdm(range(dim)): + for z in range(j,dim): + c = fastdist.cosine_matrix_to_matrix(df_embeddings.embedding[j], df_embeddings.embedding[z]) + return s_ij,e_ij,l1norm_ij,l2norm_ij,l3norm_ij +s_ij,e_ij,l1norm_ij,l2norm_ij,l3norm_ij = distance_matrix(df_embeddings) +mask = np.tril(np.ones_like(s_ij, dtype=bool)) +np.fill_diagonal(mask,False) +cmap = sns.diverging_palette(230, 20, as_cmap=True) +s_ij_rank = s_ij.reshape(-1) +s_ij_rank.transpose() +df_similarity_rank = pd.DataFrame(s_ij_rank) +df_similarity_rank = df_similarity_rank.replace(0,np.nan).dropna() +df_similarity_rank.reset_index(drop=True,inplace=True) +df_similarity_rank.describe() +SIM_BASE = df_similarity_rank.sort_values(0).quantile(0.2).values + +df_similarity = pd.DataFrame(s_ij) +df_similarity = df_similarity.replace(0,np.nan) +df_similarity.describe() +df_similarity +df_density = df_similarity + +for j in tqdm(range(len(df_density))): + for z in range(j,len(df_density.loc[j])): + df_density.loc[j,z] = np.sign(df_density.loc[j,z] - SIM_BASE) +df_embeddings['density'] = 0 +df_embeddings['average_sim_all_peers'] = 0 +for j in range(len(df_density)): + df_embeddings.loc[j,'density'] = array_density[j].sum() + df_embeddings.loc[j,'average_sim_all_peers'] = array_similarity[j].mean() + +df_embeddings_sorted = df_embeddings.sort_values(['density', 'average_sim_all_peers'],ascending=[False,False]) +df_embeddings_sorted.reset_index(drop=True,inplace=True) +# 0,1 > 1 | 2,3,4,5 > 2 +# imdb@ +# For Twitter +# cp_list = [[],[],[],[],[],[],[],[]] +# For IMDB +cp_list = [[],[]] + +for x in range(len(news_groups)): + proto_list = [] + begin = 0 + z = 0 + p_num = 0 + proto_list.append(df_embeddings_sorted.loc[0,'sample_id']) + df = df_embeddings_sorted[df_embeddings_sorted.labels == x] + df.reset_index(drop=True,inplace=True) + for j in range(1,len(df)): + if (z<=2**(p_num+1)): + z = z + 1 + if (z == 2**(p_num+1)): + z = 0 + end = j + p = df.loc[begin,'sample_id'] + for b in range(begin, end): + i_sel = df.loc[b,'sample_id'] + if 
+
+df_embeddings_sorted = df_embeddings.sort_values(['density', 'average_sim_all_peers'],ascending=[True,False])
+df_embeddings_sorted.reset_index(drop=True,inplace=True)
+
+ap_list = [[],[]]
+for x in range(len(ap_list)):
+    proto_list = []
+    begin = 0
+    z = 0
+    p_num = 0
+    df = df_embeddings_sorted[df_embeddings_sorted.labels == x]
+    df.reset_index(drop=True,inplace=True)
+    for j in range(1,len(df)):
+        if (z<=2**(p_num+1)):
+            z = z + 1
+        if (z == 2**(p_num+1)):
+            z = 0
+            end = j
+            p = df.loc[begin,'sample_id']
+            for b in range(begin, end):
+                i_sel = df.loc[b,'sample_id']
+                if (np.mean(array_similarity[i_sel][proto_list]) < np.mean(array_similarity[p][proto_list])):
+                    p = i_sel
+            proto_list.append(p)
+            begin = j
+            p_num = p_num + 1
+    ap_list[x] = proto_list
+print(ap_list)
+
+df_embeddings['class_prototype'] = 0
+for x in range(len(cp_list)):
+    for j in cp_list[x]:
+        df_embeddings.loc[j,'class_prototype'] = 1
+
+
+df_embeddings['anomaly_prototype'] = 0
+for x in range(len(ap_list)):
+    for j in ap_list[x]:
+        df_embeddings.loc[j,'anomaly_prototype'] = 1
+
+
+df_embeddings['larget_logit'] = 0
+df_embeddings['confidence'] = 0
+
+for j in range(len(df_embeddings)):
+    df_embeddings.loc[j,'confidence'] = abs(abs(df_embeddings.loc[j, 'predict_c_0']) - abs(df_embeddings.loc[j, 'predict_c_1']))
+    if df_embeddings.loc[j, 'predict_c_0'] >= 0:
+        df_embeddings.loc[j,'larget_logit'] = df_embeddings.loc[j, 'predict_c_0']
+    if df_embeddings.loc[j, 'predict_c_1'] >= 0:
+        df_embeddings.loc[j,'larget_logit'] = df_embeddings.loc[j, 'predict_c_1']
+
+df = df_embeddings[df_embeddings.average_sim_all_peers!=0]
+df.reset_index(drop=True,inplace=True)
+sns.scatterplot(df.density,df.average_sim_all_peers,s=80,label='Embeddings',c=df.confidence, cmap='gray')
+plt.title('Scatter plot of embedding density for class 0')
+plt.xlabel('Proximity')
+plt.ylabel('Similarity')
+plt.legend()
+plt.show()
\ No newline at end of file
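The two selection loops in trainer.py walk each class in sorted order, carve it into windows of size 2**(p_num+1), and keep, per window, the candidate whose mean similarity to the prototypes already chosen is lowest. A compact restatement of that schedule; it assumes sim is the filled similarity matrix and sample_ids the class's ids in sorted order, neither of which it takes verbatim from the patch:

```python
# Hypothetical restatement of the windowed prototype selection, not part of the patch.
import numpy as np

def select_prototypes(sample_ids, sim, seed_id):
    """Window k holds 2**(k+1) candidates; keep the one least similar,
    on average, to the prototypes chosen so far."""
    protos = [seed_id]
    begin, p_num = 0, 0
    while begin + 2 ** (p_num + 1) <= len(sample_ids):
        end = begin + 2 ** (p_num + 1)
        window = sample_ids[begin:end]
        best = min(window, key=lambda i: np.mean(sim[i][protos]))
        protos.append(best)
        begin, p_num = end, p_num + 1
    return protos[1:]  # drop the seed, as cp_list does
```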
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000..5f17f4b
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,31 @@
+# =============================================================================
+# Misc. utilities
+# =============================================================================
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import torch
+from torch import nn, optim
+from torch.utils.data import Dataset, DataLoader
+import torch.nn.functional as F
+import warnings
+warnings.filterwarnings('ignore')
+import sklearn
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import confusion_matrix, classification_report
+from collections import defaultdict
+from textwrap import wrap
+from joblib import load, dump
+import pickle
+from tqdm import tqdm
+import transformers
+import datetime
+import matplotlib.pylab as pylab
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.decomposition import PCA
+from scipy.stats import energy_distance
+from fastdist import fastdist
+RANDOM_SEED = 47
+torch.manual_seed(RANDOM_SEED)
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
diff --git a/test_environment.py b/test_environment.py
deleted file mode 100644
index d0ac4a7..0000000
--- a/test_environment.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import sys
-
-REQUIRED_PYTHON = "python3"
-
-
-def main():
-    system_major = sys.version_info.major
-    if REQUIRED_PYTHON == "python":
-        required_major = 2
-    elif REQUIRED_PYTHON == "python3":
-        required_major = 3
-    else:
-        raise ValueError("Unrecognized python interpreter: {}".format(
-            REQUIRED_PYTHON))
-
-    if system_major != required_major:
-        raise TypeError(
-            "This project requires Python {}. Found: Python {}".format(
-                required_major, sys.version))
-    else:
-        print(">>> Development environment passes all tests!")
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tox.ini b/tox.ini
deleted file mode 100644
index c32fbd8..0000000
--- a/tox.ini
+++ /dev/null
@@ -1,3 +0,0 @@
-[flake8]
-max-line-length = 79
-max-complexity = 10
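src/utils.py seeds only torch's generator. A fully deterministic run would typically seed the other RNGs as well; a sketch of that extension, not part of the patch:

```python
# Hypothetical extension of src/utils.py: seed every RNG the project touches.
import random
import numpy as np
import torch

RANDOM_SEED = 47
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)
```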