Skip to content

Commit

Permalink
CSV: recognize pipe separator and .psv extension for read (OSGeo#6901) (
Browse files Browse the repository at this point in the history
fixes OSGeo#6811)
  • Loading branch information
mdsumner committed Dec 12, 2022
1 parent 4804567 commit e8b2530
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 9 deletions.
21 changes: 21 additions & 0 deletions autotest/ogr/ogr_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2750,3 +2750,24 @@ def test_ogr_csv_iter_and_set_feature():
gdal.Unlink("/vsimem/ogr_csv_iter_and_set_feature.csv")

assert count == 2


###############################################################################


def test_ogr_csv_pipe_separated():
    """Read back a pipe-delimited file that uses the .psv extension.

    A two-line buffer (header ``id|str`` and one record ``1|foo``) is stored
    under /vsimem, opened with gdal.OpenEx() without any driver hint, and the
    first feature of layer 0 is checked field by field.
    """
    psv_path = "/vsimem/test_ogr_csv_pipe_separated.psv"
    psv_content = """id|str
1|foo
"""
    gdal.FileFromMemBuffer(psv_path, psv_content)

    dataset = gdal.OpenEx(psv_path)
    first_feature = dataset.GetLayer(0).GetNextFeature()
    # Both values are compared against strings, including the numeric-looking id.
    assert first_feature["id"] == "1"
    assert first_feature["str"] == "foo"
    # Drop the dataset reference before removing the in-memory file
    # (presumably this closes the dataset -- confirm against the GDAL bindings).
    dataset = None

    gdal.Unlink(psv_path)
6 changes: 4 additions & 2 deletions doc/source/drivers/vector/csv.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ point to a directory. For a directory to be recognised as a .csv
datasource at least half the files in the directory need to have the
extension .csv. One layer (table) is produced from each .csv file
accessed.
Starting with GDAL 3.7, pipe-separated values files with the .psv extension
are also recognized.

For files structured as CSV, but not ending
with .CSV extension, the 'CSV:' prefix can be added before the filename
Expand Down Expand Up @@ -77,8 +79,8 @@ CSV files have one line for each feature (record) in the layer (table).
The attribute field values are separated by commas. At least two fields
per line must be present. Lines may be terminated by a DOS (CR/LF) or
Unix (LF) style line terminators. Each record should have the same
number of fields. The driver will also accept a semicolon, a tabulation
or a space character as field separator .
number of fields. The driver will also accept a semicolon, a tabulation,
a pipe, or a space character as field separator.
This autodetection will work only if there's no other potential
separator on the first line of the CSV file. Otherwise it will default
to comma as separator.
Expand Down
12 changes: 11 additions & 1 deletion ogr/ogrsf_frmts/csv/ogrcsvdatasource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,9 @@ CPLString OGRCSVDataSource::GetRealExtension(CPLString osFilename)
else if( osFilename.size() > 7 &&
EQUAL(osFilename + osFilename.size() - 7, ".tsv.gz") )
return "tsv";
else if( osFilename.size() > 7 &&
EQUAL(osFilename + osFilename.size() - 7, ".psv.gz") )
return "psv";
}
return osExt;
}
Expand Down Expand Up @@ -570,7 +573,8 @@ int OGRCSVDataSource::Open( const char *pszFilename, int bUpdateIn,

// Is this a single CSV file?
if( VSI_ISREG(sStatBuf.st_mode)
&& (bIgnoreExtension || EQUAL(osExt, "csv") || EQUAL(osExt, "tsv")) )
&& (bIgnoreExtension || EQUAL(osExt, "csv") || EQUAL(osExt, "tsv") ||
EQUAL(osExt, "psv")) )
{
if (EQUAL(CPLGetFilename(osFilename), "NfdcFacilities.xls"))
{
Expand Down Expand Up @@ -751,6 +755,12 @@ bool OGRCSVDataSource::OpenTable( const char *pszFilename,
osLayerName = osLayerName.substr(0, osLayerName.size() - 4);
osExt = "tsv";
}
else if( strlen(pszFilename) > 7 &&
EQUAL(pszFilename + strlen(pszFilename) - 7, ".psv.gz") )
{
osLayerName = osLayerName.substr(0, osLayerName.size() - 4);
osExt = "psv";
}
}

int nMaxLineSize = atoi(CPLGetConfigOption(
Expand Down
4 changes: 2 additions & 2 deletions ogr/ogrsf_frmts/csv/ogrcsvdriver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ static int OGRCSVDriverIdentify( GDALOpenInfo *poOpenInfo )
{
return TRUE;
}
else if( EQUAL(osExt, "csv") || EQUAL(osExt, "tsv") )
else if( EQUAL(osExt, "csv") || EQUAL(osExt, "tsv") || EQUAL(osExt, "psv") )
{
return TRUE;
}
Expand Down Expand Up @@ -313,7 +313,7 @@ void RegisterOGRCSV()

poDriver->SetMetadataItem(GDAL_DMD_LONGNAME,
"Comma Separated Value (.csv)");
poDriver->SetMetadataItem(GDAL_DMD_EXTENSION, "csv");
poDriver->SetMetadataItem(GDAL_DMD_EXTENSIONS, "csv tsv psv");
poDriver->SetMetadataItem(GDAL_DMD_HELPTOPIC, "drivers/vector/csv.html");
poDriver->SetMetadataItem( GDAL_DMD_SUPPORTED_SQL_DIALECTS, "OGRSQL SQLITE" );

Expand Down
9 changes: 5 additions & 4 deletions port/cpl_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -523,10 +523,11 @@ static void CSVIngest( const char *pszFilename )

/** Detect which field separator is used.
*
* Currently, it can detect comma, semicolon, space or tabulation. In case of
* ambiguity or no separator found, comma will be considered as the separator.
* Currently, it can detect comma, semicolon, space, tabulation or pipe.
* In case of ambiguity or no separator found, comma will be considered as the
* separator.
*
* @return ',', ';', ' ' or tabulation character.
* @return ',', ';', ' ', tabulation character or '|'.
*/
char CSVDetectSeperator( const char* pszLine )
{
Expand All @@ -537,7 +538,7 @@ char CSVDetectSeperator( const char* pszLine )
for( ; *pszLine != '\0'; pszLine++ )
{
if( !bInString && ( *pszLine == ',' || *pszLine == ';'
|| *pszLine == '\t'))
|| *pszLine == '\t' || *pszLine == '|'))
{
if( chDelimiter == '\0' )
{
Expand Down

0 comments on commit e8b2530

Please sign in to comment.