diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 000000000..118d18635 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,80 @@ +# Install and run 4CAT + +## Overview +4CAT has two components, the backend and the web tool. These share some bits +of code and a configuration file but apart from that they run independently. +Communication between the two happens via a PostgreSQL database. + +## Installation +After cloning the repository, copy `config.py-example` to `config.py` and edit +the file to match your machine's configuration. The various options are +explained in the file itself. + +Note that you need to create a database and database user yourself: this is +not handled by 4CAT. Upon first running the backend, it will create new tables +and indices in the database specified in `config.py`, so make sure the +configured database user has the rights to do so. + +Next, install the dependencies. While in the 4CAT root folder, run pip: + +``` +pip3 install -r requirements.txt +``` + +You should now be set up to run 4CAT. + +## Running 4CAT +### Running the backend +The backend can be run by navigating to the `backend` folder and using the +`backend.py` script in there to control the 4CAT backend daemon: + +``` +python3 backend.py start +``` + +Other valid arguments are `stop`, `restart` and `status`. Note that 4CAT was +made to run on a UNIX-like system and the above will not work on Windows. If +you want to use Windows (this is not recommended except for testing or +development, and disabled on UNIX-like systems) you can run `bootstrap.py`, +which will run the backend directly in the terminal. + +### Running the web tool +Next, start the web tool. Navigate to the `webtool` folder and run the 4CAT +Flask app: + +``` +FLASK_APP=fourcat flask run +``` + +With the default configuration, you can now navigate to +`http://localhost:5000` where you'll find the web tool that allows you to query +the database and create datasets. 
+ +## Acquiring data +4CAT is not very useful with an empty database. To fill it with 4chan data, +you can either import data from elsewhere or scrape 4chan yourself (or do +both). + +### Import 4chan data dumps from elsewhere +Included in the `backend` folder is `import_dump.py`. You can use this script +to import dumps from 4plebs (e.g. +[these](https://archive.org/details/4plebs-org-data-dump-2018-01)). Run the +script without arguments for more information on its syntax. Note that for +larger boards, imports can take a long time to finish (multiple days). This is +due to the sheer size of the datasets, and because 4CAT needs full-text +indices to search through the data. + +### Scrape 4chan yourself +The 4CAT backend comes with a 4chan API scraper that can capture new posts +on 4chan as they are posted. You can configure which boards are to be scraped +in `config.py`. Note that the 4chan API has a rate limit and scraping too many +boards will probably make you hit that limit quite quickly. It is recommended +that you keep an eye on the backend log files when you first start scraping to +make sure you're getting all the data you want. + +## Separating the backend and web tool +While by default the web tool and backend run on the same server, you could set +things up so that they run on separate servers instead. Simply start only the +backend on one server, and only the web tool on the other. If you configure the +web tool to connect to the database on another server (or vice versa), the backend +and web tool will be able to communicate. 
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 0829ad699..7d1e0773b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,21 +1,3 @@
-requests==2.19.1
-psycopg2_binary==2.7.5
-html2text==2018.1.9
-numpy==1.15.2
-scipy==1.1.0
-stop_words==2018.7.23
-setuptools==40.0.0
-psutil==5.4.7
-Flask==1.0.2
-pandas==0.23.4
-gensim==3.6.0
-matplotlib==3.0.0
-mpld3==0.3
-APScheduler==3.5.3
-Flask_Limiter==1.0.1
-nltk==3.3
-Pillow==5.3.0
-adjustText==0.7.3
-beautifulsoup4==4.6.3
-psycopg2==2.7.5
-scikit_learn==0.20.0
+--index-url https://pypi.python.org/simple/
+
+-e .
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 000000000..2ca5a1470
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,54 @@
+"""
+Setuptools packaging script for 4CAT.
+
+Declares the `backend` and `webtool` packages together with their pinned
+dependencies. `requirements.txt` installs this package in editable mode
+(`-e .`), so dependencies are declared here rather than in that file.
+"""
+from setuptools import setup
+
+# Read the README explicitly as UTF-8 so the result does not depend on the
+# locale's default encoding.
+with open("README.md", "r", encoding="utf-8") as readmefile:
+    readme = readmefile.read()
+
+setup(
+    name='fourcat',
+    # setuptools documents `version` as a string, not a number
+    version="1",
+
+    description='4CAT: Capture and Analysis Tool is a comprehensive tool for analysing discourse on 4chan',
+    long_description=readme,
+    # README.md is Markdown; the default content type is reStructuredText
+    long_description_content_type="text/markdown",
+    author="Open Intelligence Lab",
+    author_email="4cat@oilab.eu",
+    url="https://4cat.oilab.nl",
+
+    packages=['backend', 'webtool'],
+    install_requires=[
+        # NOTE(review): psycopg2_binary and psycopg2 both provide the
+        # `psycopg2` module; presumably only one is needed - confirm.
+        "requests==2.19.1",
+        "psycopg2_binary==2.7.5",
+        "html2text==2018.1.9",
+        "numpy==1.15.2",
+        "scipy==1.1.0",
+        "stop_words==2018.7.23",
+        "setuptools==40.0.0",
+        "psutil==5.4.7",
+        "Flask==1.0.2",
+        "pandas==0.23.4",
+        "gensim==3.6.0",
+        "matplotlib==3.0.0",
+        "mpld3==0.3",
+        "APScheduler==3.5.3",
+        "Flask_Limiter==1.0.1",
+        "nltk==3.3",
+        "Pillow==5.3.0",
+        "adjustText==0.7.3",
+        "beautifulsoup4==4.6.3",
+        "psycopg2==2.7.5",
+        "scikit_learn==0.20.0"
+    ]
+)