raphg
diff --git a/Diff for: ‎Advanced_data_manipulation.Rmd
+8 b/Diff for: ‎Advanced_data_manipulation.Rmd
+8
diff --git a/Diff for: ‎Advanced_data_manipulation.html
+38-58 b/Diff for: ‎Advanced_data_manipulation.html
+38-58
diff --git a/Diff for: ‎Advanced_data_manipulation.md
+41-60 b/Diff for: ‎Advanced_data_manipulation.md
+41-60
diff --git a/Diff for: ‎Bioconductor_intro.Rmd
+7-5 b/Diff for: ‎Bioconductor_intro.Rmd
+7-5
@@ -669,6 +669,14 @@ DT
 DT[,Name:=NULL]
 ```
 
+## Listing all tables
+
+With data.table you can always list the tables that you've created, which will also return basic information on these tables, including size, keys, number of rows, etc.
+
+```{r}
+tables()
+```
+
 
 ## Bonuses: fread
 
 
@@ -950,6 +950,22 @@ <h3><code>.GRP</code></h3>
 ## 2:      3
 ## 3:      4</pre>
 
+</article></slide><slide class=''><hgroup><h2>Listing all tables</h2></hgroup><article  id="listing-all-tables" class="smaller ">
+
+<p>With data.table you can always list the tables that you&#39;ve created, which will also return basic information on these tables, including size, keys, number of rows, etc.</p>
+
+<pre class = 'prettyprint lang-r'>tables()</pre>
+
+<pre >##      NAME        NROW NCOL MB COLS         KEY
+## [1,] big_dt 1,000,000    3 20 x,y,z           
+## [2,] dt             3    3  1 x,y,z           
+## [3,] DT             3    2  1 Name,Salary     
+## [4,] DT1            5    4  1 x,y,z,newcol x  
+## [5,] DT2            1    3  1 x,y,w        x  
+## [6,] tmp1     456,976    3 32 x,y,z        x  
+## [7,] tmp2     456,976    3 32 x,y,z        x  
+## Total: 88MB</pre>
+
 </article></slide><slide class=''><hgroup><h2>Bonuses: fread</h2></hgroup><article  id="bonuses-fread" class="smaller ">
 
 <p><code>data.table</code> also comes with <code>fread</code>, a file reader much, much better than <code>read.table</code> or <code>read.csv</code>:</p>
@@ -965,8 +981,8 @@ <h3><code>.GRP</code></h3>
 
 <pre >## Unit: milliseconds
 ##   expr       min        lq      mean    median        uq       max neval
-##  fread  310.5437  310.5437  310.5437  310.5437  310.5437  310.5437     1
-##    r.t 7050.3093 7050.3093 7050.3093 7050.3093 7050.3093 7050.3093     1</pre>
+##  fread  331.6005  331.6005  331.6005  331.6005  331.6005  331.6005     1
+##    r.t 7447.6770 7447.6770 7447.6770 7447.6770 7447.6770 7447.6770     1</pre>
 
 <pre class = 'prettyprint lang-r'>unlink(file)</pre>
 
@@ -984,12 +1000,12 @@ <h3><code>.GRP</code></h3>
 <pre class = 'prettyprint lang-r'>microbenchmark(DT = rbindlist(dfs), DF = do.call(rbind, dfs), times = 5)</pre>
 
 <pre >## Unit: milliseconds
-##  expr        min         lq       mean    median         uq       max
-##    DT   5.981805   8.551772   8.997102   9.64301   9.767579  11.04134
-##    DF 709.929230 843.444154 869.686238 884.76882 954.812659 955.47633
-##  neval cld
-##      5  a 
-##      5   b</pre>
+##  expr       min        lq      mean     median         uq       max neval
+##    DT  10.02123  10.46328   10.7023   10.62325   10.65706   11.7467     5
+##    DF 787.99593 984.21012 1088.7448 1060.43489 1142.37993 1468.7029     5
+##  cld
+##   a 
+##    b</pre>
 
 </article></slide><slide class=''><hgroup><h2>Summary</h2></hgroup><article  id="summary" class="smaller ">
 
@@ -1044,44 +1060,8 @@ <h3><code>.GRP</code></h3>
 require(org.Hs.eg.db) || biocLite(&quot;org.Hs.eg.db&quot;)</pre>
 
 <pre class = 'prettyprint lang-r'># Now we can use the org.Hs.eg.db to load a database
-library(org.Hs.eg.db)</pre>
-
-<pre >## Loading required package: AnnotationDbi
-## Loading required package: BiocGenerics
-## Loading required package: parallel
-## 
-## Attaching package: &#39;BiocGenerics&#39;
-## 
-## The following objects are masked from &#39;package:parallel&#39;:
-## 
-##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
-##     clusterExport, clusterMap, parApply, parCapply, parLapply,
-##     parLapplyLB, parRapply, parSapply, parSapplyLB
-## 
-## The following object is masked from &#39;package:stats&#39;:
-## 
-##     xtabs
-## 
-## The following objects are masked from &#39;package:base&#39;:
-## 
-##     anyDuplicated, append, as.data.frame, as.vector, cbind,
-##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
-##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
-##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
-##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
-##     table, tapply, union, unique, unlist
-## 
-## Loading required package: Biobase
-## Welcome to Bioconductor
-## 
-##     Vignettes contain introductory material; view with
-##     &#39;browseVignettes()&#39;. To cite Bioconductor, see
-##     &#39;citation(&quot;Biobase&quot;)&#39;, and for packages &#39;citation(&quot;pkgname&quot;)&#39;.
-## 
-## Loading required package: GenomeInfoDb
-## Loading required package: DBI</pre>
-
-<pre class = 'prettyprint lang-r'># Create a connection
+library(org.Hs.eg.db)
+# Create a connection
 Hs_con &lt;- org.Hs.eg_dbconn()</pre>
 
 </article></slide><slide class=''><hgroup><h2>Using RSQLite</h2></hgroup><article  id="using-rsqlite-1" class="smaller ">
@@ -1120,16 +1100,16 @@ <h3><code>.GRP</code></h3>
 
 <pre class = 'prettyprint lang-r'>gc()</pre>
 
-<pre >##           used (Mb) gc trigger  (Mb) max used (Mb)
-## Ncells 1141630 61.0    2320208 124.0  1628720 87.0
-## Vcells 1261314  9.7    3648212  27.9  1986142 15.2</pre>
+<pre >##            used  (Mb) gc trigger  (Mb) max used  (Mb)
+## Ncells  1840419  98.3    4643607 248.0  4276538 228.4
+## Vcells 15889273 121.3   47075374 359.2 47071241 359.2</pre>
 
 <pre class = 'prettyprint lang-r'>alias &lt;- dbGetQuery(Hs_con, &quot;SELECT * FROM alias;&quot;)
 gc()</pre>
 
-<pre >##           used (Mb) gc trigger  (Mb) max used (Mb)
-## Ncells 1245119 66.5    3288291 175.7  1628720 87.0
-## Vcells 1625283 12.4    3648212  27.9  2370850 18.1</pre>
+<pre >##            used  (Mb) gc trigger  (Mb) max used  (Mb)
+## Ncells  1943826 103.9    4643607 248.0  4276538 228.4
+## Vcells 16187585 123.6   47075374 359.2 47071241 359.2</pre>
 
 <pre class = 'prettyprint lang-r'>gene_info &lt;- dbGetQuery(Hs_con, &quot;SELECT * FROM gene_info;&quot;)
 chromosomes &lt;- dbGetQuery(Hs_con, &quot;SELECT * FROM chromosomes;&quot;)</pre>
@@ -1139,16 +1119,16 @@ <h3><code>.GRP</code></h3>
 <pre class = 'prettyprint lang-r'>CD154_df &lt;- dbGetQuery(Hs_con, &quot;SELECT * FROM alias a JOIN gene_info g ON g._id = a._id WHERE a.alias_symbol LIKE &#39;CD154&#39;;&quot;)
 gc()</pre>
 
-<pre >##           used (Mb) gc trigger  (Mb) max used (Mb)
-## Ncells 1290972 69.0    3288291 175.7  1628720   87
-## Vcells 2112441 16.2    5187496  39.6  3004340   23</pre>
+<pre >##            used  (Mb) gc trigger  (Mb) max used  (Mb)
+## Ncells  1989679 106.3    4643607 248.0  4276538 228.4
+## Vcells 16674744 127.3   47075374 359.2 47071241 359.2</pre>
 
 <pre class = 'prettyprint lang-r'>CD40LG_alias_df &lt;- dbGetQuery(Hs_con, &quot;SELECT * FROM alias a JOIN gene_info g ON g._id = a._id WHERE g.symbol LIKE &#39;CD40LG&#39;;&quot;)
 gc()</pre>
 
-<pre >##           used (Mb) gc trigger  (Mb) max used (Mb)
-## Ncells 1290989 69.0    3288291 175.7  1628720   87
-## Vcells 2112537 16.2    5187496  39.6  3004340   23</pre>
+<pre >##            used  (Mb) gc trigger  (Mb) max used  (Mb)
+## Ncells  1989696 106.3    4643607 248.0  4276538 228.4
+## Vcells 16674841 127.3   47075374 359.2 47071241 359.2</pre>
 
 </article></slide><slide class=''><hgroup><h2>Some SQL Commands</h2></hgroup><article  id="some-sql-commands" class="smaller ">
 
 
@@ -1102,6 +1102,27 @@ DT[, `:=`(Name, NULL)]
 ## 3:      4
 ```
 
+## Listing all tables
+
+With data.table you can always list the tables that you've created, which will also return basic information on these tables, including size, keys, number of rows, etc.
+
+
+```r
+tables()
+```
+
+```
+##      NAME        NROW NCOL MB COLS         KEY
+## [1,] big_dt 1,000,000    3 20 x,y,z           
+## [2,] dt             3    3  1 x,y,z           
+## [3,] DT             3    2  1 Name,Salary     
+## [4,] DT1            5    4  1 x,y,z,newcol x  
+## [5,] DT2            1    3  1 x,y,w        x  
+## [6,] tmp1     456,976    3 32 x,y,z        x  
+## [7,] tmp2     456,976    3 32 x,y,z        x  
+## Total: 88MB
+```
+
 
 ## Bonuses: fread
 
@@ -1123,8 +1144,8 @@ microbenchmark(fread = fread(file), r.t = read.table(file, header = TRUE, sep =
 ```
 ## Unit: milliseconds
 ##   expr       min        lq      mean    median        uq       max neval
-##  fread  310.5437  310.5437  310.5437  310.5437  310.5437  310.5437     1
-##    r.t 7050.3093 7050.3093 7050.3093 7050.3093 7050.3093 7050.3093     1
+##  fread  331.6005  331.6005  331.6005  331.6005  331.6005  331.6005     1
+##    r.t 7447.6770 7447.6770 7447.6770 7447.6770 7447.6770 7447.6770     1
 ```
 
 ```r
@@ -1153,12 +1174,12 @@ microbenchmark(DT = rbindlist(dfs), DF = do.call(rbind, dfs), times = 5)
 
 ```
 ## Unit: milliseconds
-##  expr        min         lq       mean    median         uq       max
-##    DT   5.981805   8.551772   8.997102   9.64301   9.767579  11.04134
-##    DF 709.929230 843.444154 869.686238 884.76882 954.812659 955.47633
-##  neval cld
-##      5  a 
-##      5   b
+##  expr       min        lq      mean     median         uq       max neval
+##    DT  10.02123  10.46328   10.7023   10.62325   10.65706   11.7467     5
+##    DF 787.99593 984.21012 1088.7448 1060.43489 1142.37993 1468.7029     5
+##  cld
+##   a 
+##    b
 ```
 
 ## Summary
@@ -1222,46 +1243,6 @@ require(org.Hs.eg.db) || biocLite("org.Hs.eg.db")
 ```r
 # Now we can use the org.Hs.eg.db to load a database
 library(org.Hs.eg.db)
-```
-
-```
-## Loading required package: AnnotationDbi
-## Loading required package: BiocGenerics
-## Loading required package: parallel
-## 
-## Attaching package: 'BiocGenerics'
-## 
-## The following objects are masked from 'package:parallel':
-## 
-##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
-##     clusterExport, clusterMap, parApply, parCapply, parLapply,
-##     parLapplyLB, parRapply, parSapply, parSapplyLB
-## 
-## The following object is masked from 'package:stats':
-## 
-##     xtabs
-## 
-## The following objects are masked from 'package:base':
-## 
-##     anyDuplicated, append, as.data.frame, as.vector, cbind,
-##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
-##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
-##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
-##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
-##     table, tapply, union, unique, unlist
-## 
-## Loading required package: Biobase
-## Welcome to Bioconductor
-## 
-##     Vignettes contain introductory material; view with
-##     'browseVignettes()'. To cite Bioconductor, see
-##     'citation("Biobase")', and for packages 'citation("pkgname")'.
-## 
-## Loading required package: GenomeInfoDb
-## Loading required package: DBI
-```
-
-```r
 # Create a connection
 Hs_con <- org.Hs.eg_dbconn()
 ```
@@ -1327,9 +1308,9 @@ gc()
 ```
 
 ```
-##           used (Mb) gc trigger  (Mb) max used (Mb)
-## Ncells 1141630 61.0    2320208 124.0  1628720 87.0
-## Vcells 1261314  9.7    3648212  27.9  1986142 15.2
+##            used  (Mb) gc trigger  (Mb) max used  (Mb)
+## Ncells  1840419  98.3    4643607 248.0  4276538 228.4
+## Vcells 15889273 121.3   47075374 359.2 47071241 359.2
 ```
 
 ```r
@@ -1338,9 +1319,9 @@ gc()
 ```
 
 ```
-##           used (Mb) gc trigger  (Mb) max used (Mb)
-## Ncells 1245119 66.5    3288291 175.7  1628720 87.0
-## Vcells 1625283 12.4    3648212  27.9  2370850 18.1
+##            used  (Mb) gc trigger  (Mb) max used  (Mb)
+## Ncells  1943826 103.9    4643607 248.0  4276538 228.4
+## Vcells 16187585 123.6   47075374 359.2 47071241 359.2
 ```
 
 ```r
@@ -1357,9 +1338,9 @@ gc()
 ```
 
 ```
-##           used (Mb) gc trigger  (Mb) max used (Mb)
-## Ncells 1290972 69.0    3288291 175.7  1628720   87
-## Vcells 2112441 16.2    5187496  39.6  3004340   23
+##            used  (Mb) gc trigger  (Mb) max used  (Mb)
+## Ncells  1989679 106.3    4643607 248.0  4276538 228.4
+## Vcells 16674744 127.3   47075374 359.2 47071241 359.2
 ```
 
 ```r
@@ -1368,9 +1349,9 @@ gc()
 ```
 
 ```
-##           used (Mb) gc trigger  (Mb) max used (Mb)
-## Ncells 1290989 69.0    3288291 175.7  1628720   87
-## Vcells 2112537 16.2    5187496  39.6  3004340   23
+##            used  (Mb) gc trigger  (Mb) max used  (Mb)
+## Ncells  1989696 106.3    4643607 248.0  4276538 228.4
+## Vcells 16674841 127.3   47075374 359.2 47071241 359.2
 ```
 
 ## Some SQL Commands
 
@@ -2,7 +2,7 @@
 title: 'Bioinformatics for Big Omics Data: Introduction to Bioconductor
 '
 author: "Raphael Gottardo"
-date: "January 15, 2014"
+date: "January 21, 2014"
 output:
   ioslides_presentation:
     fig_caption: yes
@@ -169,11 +169,12 @@ head(res)
 ## Finding specific data
 
 From previous table:
-bioc_package = bioconductor package
-hu6800 = Affymetrix HuGeneFL Genome Array annotation data (chip hu6800) 
-rgu34a = Affymetrix Rat Genome U34 Set annotation data (chip rgu34a)
 
-title = data set title or study title
+- bioc_package = Bioconductor package
+- hu6800 = Affymetrix HuGeneFL Genome Array annotation data (chip hu6800) 
+- rgu34a = Affymetrix Rat Genome U34 Set annotation data (chip rgu34a)
+- title = data set title or study title
+
 For example BM_CD34-1a = bone marrow flow-sorted CD34+ cells (>95% purity) and has GSM sample number GSM575. 
 
 ## Getting the data we want
@@ -239,6 +240,7 @@ library(Biobase)
 showMethods(class="eSet")
 ```
 in particular, the following methods are rather convenient:
+
 - assayData(obj); assayData(obj) `<-` value: access or assign assayData
 - phenoData(obj); phenoData(obj) `<-` value: access or assign phenoData
 - experimentData(obj); experimentData(obj) `<-` value: access or assign experimentData