Read file by extension into R.
Usage
import(con, format, text, ...)
# S4 method for character,missing,missing
import(con, format, text, ...)
# S4 method for character,character,missing
import(con, format, text, ...)
# S4 method for PipetteRDSFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipetteRDataFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipetteDelimFile,missing,missing
import(
con,
format,
text,
rownames = TRUE,
rownameCol = NULL,
colnames = TRUE,
quote = "\"",
comment = "",
skip = 0L,
nMax = Inf,
engine = getOption(x = "acid.import.engine", default = "base"),
makeNames = getOption(x = "acid.import.make.names", default = syntactic::makeNames),
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE),
verbose = getOption(x = "acid.verbose", default = FALSE)
)
# S4 method for PipetteLinesFile,missing,missing
import(
con,
format,
text,
comment = "",
skip = 0L,
nMax = Inf,
stripWhitespace = FALSE,
removeBlank = FALSE,
metadata = getOption(x = "acid.import.metadata", default = FALSE),
engine = getOption(x = "acid.import.engine", default = "base"),
quiet = getOption(x = "acid.quiet", default = FALSE),
verbose = getOption(x = "acid.verbose", default = FALSE)
)
# S4 method for PipetteExcelFile,missing,missing
import(
con,
format,
text,
sheet = 1L,
rownames = TRUE,
rownameCol = NULL,
colnames = TRUE,
skip = 0L,
nMax = Inf,
makeNames = getOption(x = "acid.import.make.names", default = syntactic::makeNames),
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE)
)
# S4 method for PipetteBAMFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipetteBCFFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipetteCRAMFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipetteFASTAFile,missing,missing
import(
con,
format,
text,
moleculeType = c("DNA", "RNA", "AA"),
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE)
)
# S4 method for PipetteFASTQFile,missing,missing
import(
con,
format,
text,
moleculeType = c("DNA", "RNA"),
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE)
)
# S4 method for PipetteGCTFile,missing,missing
import(
con,
format,
text,
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE),
return = c("matrix", "data.frame")
)
# S4 method for PipetteGMTFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipetteGMXFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipetteGRPFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipetteJSONFile,missing,missing
import(
con,
format,
text,
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE)
)
# S4 method for PipetteMAFFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipetteMTXFile,missing,missing
import(
con,
format,
text,
rownamesFile,
colnamesFile,
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE)
)
# S4 method for PipetteOBOFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipettePZFXFile,missing,missing
import(
con,
format,
text,
sheet = 1L,
makeNames = getOption(x = "acid.import.make.names", default = syntactic::makeNames),
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE)
)
# S4 method for PipetteSAMFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipetteVCFFile,missing,missing
import(con, format, text, quiet = getOption(x = "acid.quiet", default = FALSE))
# S4 method for PipetteYAMLFile,missing,missing
import(
con,
format,
text,
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE)
)
# S4 method for PipetteBcbioCountsFile,missing,missing
import(
con,
format,
text,
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE)
)
# S4 method for PipetteRioFile,missing,missing
import(
con,
format,
text,
rownames = TRUE,
rownameCol = NULL,
colnames = TRUE,
makeNames = getOption(x = "acid.import.make.names", default = syntactic::makeNames),
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE),
...
)
# S4 method for PipetteRtracklayerFile,missing,missing
import(
con,
format,
text,
metadata = getOption(x = "acid.import.metadata", default = FALSE),
quiet = getOption(x = "acid.quiet", default = FALSE),
...
)
# S4 method for textConnection,character,missing
import(
con,
format = c("csv", "tsv"),
text,
colnames = TRUE,
quote = "\"",
quiet = getOption(x = "acid.quiet", default = FALSE)
)
Arguments
- con
character(1), connection, or missing.
The connection from which data is loaded or to which data is saved. If this is a character vector, it is assumed to be a filename, and a corresponding file connection is created and then closed after exporting the object. If a BiocFile derivative, the data is loaded from or saved to the underlying resource. If missing, the function will return the output as a character vector, rather than writing to a connection.
- format
character(1) or missing.
An optional file format type, which can be used to override the file format inferred from con. Only recommended for file and URL paths that don't contain an extension.
- text
character
ormissing
. Not currently supported.- ...
Additional arguments.
- quiet
logical(1). Perform command quietly, suppressing messages.
- rownames
logical(1). Automatically assign row names, if a rowname column is defined. Applies to file types that return a data frame only.
- rownameCol
NULL
,character(1)
, orinteger(1)
. Applies only whenrownames = TRUE
. Column name to use for row names assignment. If leftNULL
(default), the function will callmatchRownameCol()
internally to attempt to automatically match the row name column (e.g."rowname"
or"rn"
). Otherwise, can manually define using a scalar argument, either the name directly or position in the column names.- colnames
logical(1)
orcharacter
. Automatically assign column names, using the first header row. Applies to file types that returndata.frame
only. Pass in acharacter
vector to define the column names manually.- quote
character(1)
. The set of quoting characters. To disable quoting altogether, usequote = ""
(not generally recommended). Applies to plain text delimited files only.- comment
character(1)
. Comment character to detect at beginning of line, which will skip when parsing file. Use""
to disable interpretation of comments, which is particularly useful when parsing lines. Applies to plain text delimited and source code lines only.- skip
integer(1)
. Number of lines to skip. Applies to delimited file (CSV, TSV), Excel Workbook, or lines.- nMax
integer(1)
orInf
. Maximum number of lines to parse. Applies to plain text delimited, Excel, and source code lines only.- engine
character(1)
. Engine (package) to use for import. Currently supported:
base
data.table
readr
- makeNames
function
. Apply syntactic naming function to (column) names. Function is never applied to row names, when they are defined in object.- metadata
list
. Metadata.- verbose
logical(1)
. Run the function with verbose output.- stripWhitespace
logical(1)
. Strip leading and/or trailing whitespace. Applies to source code lines.- removeBlank
logical(1)
. Remove blank lines. Applies to source code lines.- sheet
character(1)
orinteger(1)
. Sheet to read. Either a string (the name of a sheet), or an integer (the position of the sheet). Defaults to the first sheet. Applies to Excel Workbook, Google Sheet, or GraphPad Prism file.- moleculeType
character(1). Molecule type, either DNA or RNA (the FASTA method additionally accepts AA). Most RNA-seq FASTQ files contain complementary DNA (cDNA) sequences, not direct sequencing of the RNA molecules.
- return
character(1)
. Object class to return.- rownamesFile, colnamesFile
character(1) or NULL. Row names and/or column names sidecar file. Applies primarily to MatrixMarket Exchange files (e.g. MTXFile).
Value
Varies, depending on the file type (format):
R data serialized (RDS): variable.
Currently recommended over RDA, if possible.
Imported by readRDS().
R data (
RDA
,RDATA
): variable.
Must contain a single object. Doesn't require internal object name to match, unlike loadData().
Imported by load().
Plain text delimited (
CSV
,TSV
,TXT
):data.frame
.
Data separated by commas, tabs, or visual spaces.
Note that TXT structure is ambiguous and actively discouraged.
Refer to the Data frame return section for details on how to change the default return type to DFrame, tbl_df or data.table.
Imported by readr::read_delim() by default.
Excel workbook (
XLSB
,XLSX
):data.frame
.
Resave in plain text delimited format instead, if possible.
Imported byreadxl::read_excel()
.Legacy Excel workbook (pre-2007) (
XLS
):data.frame
.
Resave in plain text delimited format instead, if possible.
Note that import of files in this format is slow.
Imported byreadxl::read_excel()
.GraphPad Prism project (
PZFX
):data.frame
.
Experimental. Consider resaving in CSV format instead.
Imported bypzfx::read_pzfx()
.General feature format (
GFF
,GFF1
,GFF2
,GFF3
,GTF
):GRanges
.
Imported byrtracklayer::import()
.MatrixMarket exchange sparse matrix (
MTX
):sparseMatrix
.
Imported by Matrix::readMM().
Sequence alignment/map format (SAM, BAM, CRAM): list.
Imported by Rsamtools::scanBam().
Mutation annotation format (
MAF
):MAF
.
Imported bymaftools::read.maf()
.Variant annotation format (
VCF
,BCF
):list
.
Imported byRsamtools::scanBcf
.Gene cluster text (
GCT
):matrix
ordata.frame
.
Imported byreadr::read_delim()
.Gene sets (for GSEA) (
GMT
,GMX
):character
.Browser extensible data (
BED
,BED15
,BEDGRAPH
,BEDPE
):GRanges
.
Imported byrtracklayer::import()
.ChIP-seq peaks (
BROADPEAK
,NARROWPEAK
):GRanges
.
Imported byrtracklayer::import()
.Wiggle track format (
BIGWIG
,BW
,WIG
):GRanges
.
Imported byrtracklayer::import()
.JSON serialization data (
JSON
):list
.
Imported byjsonlite::read_json()
.YAML serialization data (
YAML
,YML
):list
.
Imported byyaml::yaml.load_file()
.Lines (
LOG
,MD
,PY
,R
,RMD
,SH
):character
.
Source code or log files.
Imported byreadr::read_delim()
by default.Infrequently used rio-compatible formats (
ARFF
,DBF
,DIF
,DTA
,MAT
,MTP
,ODS
,POR
,SAS7BDAT
,SAV
,SYD
,REC
,XPT
): variable.
Imported byrio::import()
.
Details
import() supports automatic loading of common file types, by wrapping
popular importer functions. It is intentionally designed to be simple, with few
arguments. Remote URLs and compressed files are supported. If you need more
complex import settings, just call the wrapped importer directly instead.
Row and column names
Row names. Row name handling has become an inconsistent mess in R because
of differential support in base R, tidyverse, data.table, and Bioconductor.
To maintain sanity, import() attempts to handle row names automatically.
The function checks for a rowname column in delimited data, and moves these
values into the object's row names, if supported by the return type (e.g.
data.frame, DFrame). Note that tbl_df (tibble) and data.table intentionally
do not support row names. When returning in this format, no attempt to assign
the rowname column into the return object's row names is made. Note that
import() is strict about this matching and only checks for a rowname column,
similar to the default syntax recommended in tibble::rownames_to_column().
To disable this behavior, set rownames = FALSE, and no attempt will be made
to set the row names.
Column names. import()
assumes that delimited files always contain
column names. If you are working with a file that doesn't contain column
names, either set colnames = FALSE
or pass the names in as a character
vector. It's strongly recommended to always define column names in a
supported file type.
FASTA and FASTQ files
FASTA and FASTQ files are currently managed internally by the Biostrings
package. Refer to readDNAStringSet
and readRNAStringSet
for details.
Import of these files will return DNAStringSet
or RNAStringSet
depending
on the input, defined by moleculeType
argument.
General feature format (GFF, GTF)
The GFF (General Feature Format) format consists of one line per feature, each containing 9 columns of data, plus optional track definition lines. The GTF (General Transfer Format) is identical to GFF version 2.
See also:
Gene cluster text format (GCT)
Refer to the IGV GCT website for details.
GSEA gene set files
Refer to the Broad Institute GSEA wiki for details.
Matrix Market Exchange
Reading a Matrix Market Exchange file requires ROWNAMES
and COLNAMES
sidecar files containing the corresponding row and column names of the sparse
matrix.
bcbio-nextgen count matrix
bcbio count matrix (e.g. generated from featureCounts) and related sidecar files are natively supported.
- COUNTS: Counts table (e.g. RNA-seq aligned counts).
- COLNAMES: Sidecar file containing column names.
- ROWNAMES: Sidecar file containing row names.
Denylisted extensions
These file formats are intentionally not supported:
DOC, DOCX, PDF, PPT, PPTX.
See also
Examples
con <- system.file("extdata", "example.csv", package = "pipette")
## Row and column names enabled.
x <- import(con = con)
#> → Importing /private/var/folders/l1/8y8sjzmn15v49jgrqglghcfr0000gn/T/RtmpAse1qU/temp_libpath1358973b737ad/pipette/extdata/example.csv using base::`read.table()`.
#> ℹ Setting row names from `rowname` column.
print(head(x))
#> sample1 sample2 sample3 sample4
#> gene1 16 20 13 16
#> gene2 29 22 43 50
#> gene3 243 245 186 184
#> gene4 7 14 25 16
#> gene5 1 1 2 2
#> gene6 10 17 18 11
## Row and column names disabled.
x <- import(con = con, rownames = FALSE, colnames = FALSE)
#> → Importing /private/var/folders/l1/8y8sjzmn15v49jgrqglghcfr0000gn/T/RtmpAse1qU/temp_libpath1358973b737ad/pipette/extdata/example.csv using base::`read.table()`.
print(head(x))
#> V1 V2 V3 V4 V5
#> 1 rowname sample1 sample2 sample3 sample4
#> 2 gene1 16 20 13 16
#> 3 gene2 29 22 43 50
#> 4 gene3 243 245 186 184
#> 5 gene4 7 14 25 16
#> 6 gene5 1 1 2 2