diff --git a/cmds/extract_cmd.go b/cmds/extract_cmd.go index 837b2fe0..5dfac0cc 100644 --- a/cmds/extract_cmd.go +++ b/cmds/extract_cmd.go @@ -92,20 +92,25 @@ func (cmd *ExtractCommand) AddFlags(fl *pflag.FlagSet) { fl.StringArrayVar(&cmd.extensionDefs, "ext", nil, "Include GTFS Extension") fl.IntVar(&cmd.fvid, "fvid", 0, "Specify FeedVersionID when writing to a database") fl.BoolVar(&cmd.create, "create", false, "Create a basic database schema if none exists") + fl.BoolVar(&cmd.writeExtraColumns, "write-extra-columns", false, "Include extra columns in output") + // Copy options - fl.Float64Var(&cmd.SimplifyShapes, "simplify-shapes", 0.0, "Simplify shapes with this tolerance (ex. 0.000005)") fl.BoolVar(&cmd.AllowEntityErrors, "allow-entity-errors", false, "Allow entities with errors to be copied") fl.IntVar(&cmd.Options.ErrorLimit, "error-limit", 10, "Max number of detailed errors per error group") - fl.BoolVar(&cmd.AllowReferenceErrors, "allow-reference-errors", false, "Allow entities with reference errors to be copied") - fl.BoolVar(&cmd.InterpolateStopTimes, "interpolate-stop-times", false, "Interpolate missing StopTime arrival/departure values") - fl.BoolVar(&cmd.CreateMissingShapes, "create-missing-shapes", false, "Create missing Shapes from Trip stop-to-stop geometries") - fl.BoolVar(&cmd.NormalizeServiceIDs, "normalize-service-ids", false, "Create any missing Calendar entities for CalendarDate service_id's") + fl.BoolVar(&cmd.Options.AllowReferenceErrors, "allow-reference-errors", false, "Allow entities with reference errors to be copied") + fl.BoolVar(&cmd.Options.InterpolateStopTimes, "interpolate-stop-times", false, "Interpolate missing StopTime arrival/departure values") + fl.BoolVar(&cmd.Options.CreateMissingShapes, "create-missing-shapes", false, "Create missing Shapes from Trip stop-to-stop geometries") + fl.BoolVar(&cmd.Options.NormalizeServiceIDs, "normalize-service-ids", false, "Create any missing Calendar entities for CalendarDate service_id's") 
fl.BoolVar(&cmd.Options.DeduplicateJourneyPatterns, "deduplicate-stop-times", false, "Deduplicate StopTimes using Journey Patterns") - fl.BoolVar(&cmd.SimplifyCalendars, "simplify-calendars", false, "Attempt to simplify CalendarDates into regular Calendars") - fl.BoolVar(&cmd.Options.NormalizeTimezones, "normalize-timezones", false, "Normalize timezones and apply default stop timezones based on agency and parent stops") - fl.BoolVar(&cmd.UseBasicRouteTypes, "use-basic-route-types", false, "Collapse extended route_type's into basic GTFS values") - fl.BoolVar(&cmd.CopyExtraFiles, "write-extra-files", false, "Copy additional files found in source to destination") - fl.BoolVar(&cmd.writeExtraColumns, "write-extra-columns", false, "Include extra columns in output") + fl.BoolVar(&cmd.Options.SimplifyCalendars, "simplify-calendars", false, "Attempt to simplify CalendarDates into regular Calendars") + fl.BoolVar(&cmd.Options.CopyExtraFiles, "write-extra-files", false, "Copy additional files found in source to destination") + + // Common extension options + fl.Float64Var(&cmd.Options.CommonExtensions.SimplifyShapes, "simplify-shapes", 0.0, "Simplify shapes with this tolerance (ex. 
0.000005)") + fl.BoolVar(&cmd.Options.CommonExtensions.NormalizeTimezones, "normalize-timezones", false, "Normalize timezones and apply default stop timezones based on agency and parent stops") + fl.BoolVar(&cmd.Options.CommonExtensions.UseBasicRouteTypes, "use-basic-route-types", false, "Collapse extended route_type's into basic GTFS values") + fl.Float64Var(&cmd.Options.CommonExtensions.ShapeMaxSegmentLength, "shape-max-segment-length", 0.0, "Maximum shape segment length in meters (0.0 to disable check)") + fl.BoolVar(&cmd.Options.CommonExtensions.NullIslandCheck, "null-island-check", false, "Check for Null Island in shapes.txt and stops.txt") // Extract options fl.StringArrayVar(&cmd.extractAgencies, "extract-agency", nil, "Extract Agency") diff --git a/cmds/import_cmd.go b/cmds/import_cmd.go index 64e246e9..f07438e5 100644 --- a/cmds/import_cmd.go +++ b/cmds/import_cmd.go @@ -66,13 +66,19 @@ func (cmd *ImportCommand) AddFlags(fl *pflag.FlagSet) { fl.BoolVar(&cmd.Latest, "latest", false, "Only import latest feed version available for each feed") fl.BoolVar(&cmd.DryRun, "dryrun", false, "Dry run; print feeds that would be imported and exit") fl.BoolVar(&cmd.Options.Activate, "activate", false, "Set as active feed version after import") + // Copy options - fl.Float64Var(&cmd.Options.SimplifyShapes, "simplify-shapes", 0.0, "Simplify shapes with this tolerance (ex. 
0.000005)") fl.BoolVar(&cmd.Options.InterpolateStopTimes, "interpolate-stop-times", false, "Interpolate missing StopTime arrival/departure values") fl.BoolVar(&cmd.Options.DeduplicateJourneyPatterns, "deduplicate-stop-times", false, "Deduplicate StopTimes using Journey Patterns") fl.BoolVar(&cmd.Options.CreateMissingShapes, "create-missing-shapes", false, "Create missing Shapes from Trip stop-to-stop geometries") fl.BoolVar(&cmd.Options.SimplifyCalendars, "simplify-calendars", false, "Attempt to simplify CalendarDates into regular Calendars") - fl.BoolVar(&cmd.Options.NormalizeTimezones, "normalize-timezones", false, "Normalize timezones and apply default stop timezones based on agency and parent stops") + + // Common extension options + fl.Float64Var(&cmd.Options.CommonExtensions.SimplifyShapes, "simplify-shapes", 0.0, "Simplify shapes with this tolerance (ex. 0.000005)") + fl.BoolVar(&cmd.Options.CommonExtensions.NormalizeTimezones, "normalize-timezones", false, "Normalize timezones and apply default stop timezones based on agency and parent stops") + fl.BoolVar(&cmd.Options.CommonExtensions.UseBasicRouteTypes, "use-basic-route-types", false, "Collapse extended route_type's into basic GTFS values") + fl.Float64Var(&cmd.Options.CommonExtensions.ShapeMaxSegmentLength, "shape-max-segment-length", 0.0, "Maximum shape segment length in meters (0.0 to disable check)") + fl.BoolVar(&cmd.Options.CommonExtensions.NullIslandCheck, "null-island-check", false, "Check for Null Island in shapes.txt and stops.txt") } // Parse command line flags diff --git a/copier/copier.go b/copier/copier.go index 1ba4d871..f1b38289 100644 --- a/copier/copier.go +++ b/copier/copier.go @@ -16,11 +16,9 @@ import ( "github.com/interline-io/log" "github.com/interline-io/transitland-lib/adapters" "github.com/interline-io/transitland-lib/causes" - "github.com/interline-io/transitland-lib/ext" "github.com/interline-io/transitland-lib/filters" "github.com/interline-io/transitland-lib/gtfs" 
"github.com/interline-io/transitland-lib/internal/geomcache" - "github.com/interline-io/transitland-lib/rules" "github.com/interline-io/transitland-lib/service" "github.com/interline-io/transitland-lib/tlcsv" "github.com/interline-io/transitland-lib/tlxy" @@ -95,80 +93,6 @@ type hasLine interface { ////////// Copier ////////// //////////////////////////// -// Options defines the settable options for a Copier. -type Options struct { - // Batch size - BatchSize int - // Skip most validation filters - NoValidators bool - // Skip shape cache - NoShapeCache bool - // Attempt to save an entity that returns validation errors - AllowEntityErrors bool - AllowReferenceErrors bool - // Interpolate any missing StopTime values: ArrivalTime/DepartureTime/ShapeDistTraveled - InterpolateStopTimes bool - // Create a stop-to-stop Shape for Trips without a ShapeID. - CreateMissingShapes bool - // Create missing Calendar entries - NormalizeServiceIDs bool - // Normalize timezones, e.g. US/Pacific -> America/Los_Angeles - NormalizeTimezones bool - // Simplify Calendars that use mostly CalendarDates - SimplifyCalendars bool - // Convert extended route types to primitives - UseBasicRouteTypes bool - // Copy extra files (requires CSV input) - CopyExtraFiles bool - // Simplify shapes - SimplifyShapes float64 - // Convert route network_id to networks.txt/route_networks.txt - NormalizeNetworks bool - // DeduplicateStopTimes - DeduplicateJourneyPatterns bool - // Error limit - ErrorLimit int - // Logging level - Quiet bool - // Default error handler - ErrorHandler ErrorHandler - // Entity selection strategy - Marker Marker - // Journey Pattern Key Function - JourneyPatternKey func(*gtfs.Trip) string - // Named extensions - ExtensionDefs []string - // Initialized extensions - exts []optionExtLevel -} - -type optionExtLevel struct { - ext any - level int -} - -func (opts *Options) AddExtension(ext any) { - opts.AddExtensionWithLevel(ext, 0) -} - -func (opts *Options) ParseExtensionDef(extDef 
string) (ext.Extension, error) { - extName, extArgs, err := ext.ParseExtensionArgs(extDef) - if err != nil { - return nil, err - } - e, err := ext.GetExtension(extName, extArgs) - if err != nil { - return nil, fmt.Errorf("error creating extension '%s' with args '%s': %s", extName, extArgs, err.Error()) - } else if e == nil { - return nil, fmt.Errorf("no registered extension for '%s'", extName) - } - return e, nil -} - -func (opts *Options) AddExtensionWithLevel(e any, level int) { - opts.exts = append(opts.exts, optionExtLevel{ext: e, level: level}) -} - //////////////////////////////////// // Copier //////////////////////////////////// @@ -275,62 +199,24 @@ func NewCopier(ctx context.Context, reader adapters.Reader, writer adapters.Writ } // Default set of validators - var addExts []any - addExts = append(addExts, copier.geomCache) - - // Minimal validators - if !opts.NoValidators { - addExts = append(addExts, - &rules.EntityDuplicateIDCheck{}, - &rules.EntityDuplicateKeyCheck{}, - &rules.ValidFarezoneCheck{}, - &rules.AgencyIDConditionallyRequiredCheck{}, - &rules.StopTimeSequenceCheck{}, - &rules.InconsistentTimezoneCheck{}, - &rules.ParentStationLocationTypeCheck{}, - &rules.CalendarDuplicateDates{}, - &rules.FareProductRiderCategoryDefaultCheck{}, - &rules.TransferStopLocationTypeCheck{}, - ) - } - - // Default extensions - if copier.options.UseBasicRouteTypes { - // Convert extended route types to basic route types - addExts = append(addExts, &filters.BasicRouteTypeFilter{}) - } - if copier.options.NormalizeTimezones { - // Normalize timezones and apply agency/stop timezones where empty - addExts = append(addExts, &filters.NormalizeTimezoneFilter{}) - addExts = append(addExts, &filters.ApplyParentTimezoneFilter{}) - } - if copier.options.SimplifyShapes > 0 { - // Simplify shapes.txt - addExts = append(addExts, &filters.SimplifyShapeFilter{SimplifyValue: copier.options.SimplifyShapes}) - } - if copier.options.NormalizeNetworks { - // Convert routes.txt 
network_id to networks.txt/route_networks.txt - addExts = append(addExts, &filters.RouteNetworkIDFilter{}) - } else { - addExts = append(addExts, &filters.RouteNetworkIDCompatFilter{}) - } - if copier.options.SimplifyCalendars && copier.options.NormalizeServiceIDs { - // Simplify calendar and calendar dates - addExts = append(addExts, &filters.SimplifyCalendarFilter{}) - } - - // Set default extension level to 0 + commonExts := copier.options.CommonExtensions.Extensions() var addExtLevels []optionExtLevel - for _, e := range addExts { + addExtLevels = append(addExtLevels, optionExtLevel{ext: copier.geomCache, level: 0}) + for _, e := range commonExts { addExtLevels = append(addExtLevels, optionExtLevel{ext: e, level: 0}) } + // Simplify calendar and calendar dates + if copier.options.SimplifyCalendars && copier.options.NormalizeServiceIDs { + addExtLevels = append(addExtLevels, optionExtLevel{ext: &filters.SimplifyCalendarFilter{}, level: 0}) + } + // Add Option extensions addExtLevels = append(addExtLevels, opts.exts...) // Parse option extension defs for _, extDef := range opts.ExtensionDefs { - e, err := opts.ParseExtensionDef(extDef) + e, err := ParseExtensionDef(extDef) if err != nil { return nil, fmt.Errorf("failed to parse extension: %s", err.Error()) } diff --git a/copier/opts.go b/copier/opts.go new file mode 100644 index 00000000..8ac04cb5 --- /dev/null +++ b/copier/opts.go @@ -0,0 +1,152 @@ +package copier + +import ( + "fmt" + + "github.com/interline-io/transitland-lib/ext" + "github.com/interline-io/transitland-lib/filters" + "github.com/interline-io/transitland-lib/gtfs" + "github.com/interline-io/transitland-lib/rules" +) + +type CommonExtensions struct { + // Skip most validation filters + NoValidators bool + // Normalize timezones, e.g. 
US/Pacific -> America/Los_Angeles + NormalizeTimezones bool + // Convert extended route types to primitives + UseBasicRouteTypes bool + // Simplify shapes + SimplifyShapes float64 + // Convert route network_id to networks.txt/route_networks.txt + NormalizeNetworks bool + // Maximum shape segment length in meters + ShapeMaxSegmentLength float64 + // Exclude stops and shapes with one or both zero coordinates + NullIslandCheck bool +} + +func (opts *CommonExtensions) Extensions() []any { + // Default set of validators + var addExts []any + + // Minimal validators + if !opts.NoValidators { + addExts = append(addExts, + &rules.EntityDuplicateIDCheck{}, + &rules.EntityDuplicateKeyCheck{}, + &rules.ValidFarezoneCheck{}, + &rules.AgencyIDConditionallyRequiredCheck{}, + &rules.StopTimeSequenceCheck{}, + &rules.InconsistentTimezoneCheck{}, + &rules.ParentStationLocationTypeCheck{}, + &rules.CalendarDuplicateDates{}, + &rules.FareProductRiderCategoryDefaultCheck{}, + &rules.TransferStopLocationTypeCheck{}, + ) + } + + // Optional rules that are best practices but can + // have a significant data quality impact + if opts.ShapeMaxSegmentLength > 0 { + // Check shape segment lengths + addExts = append(addExts, &rules.ShapeMaxSegmentLengthCheck{ + MaxAllowedDistance: opts.ShapeMaxSegmentLength, + }) + } + if opts.NullIslandCheck { + // Exclude stops with zero coordinates + addExts = append(addExts, &rules.NullIslandCheck{}) + } + + // Optional filters for common data transformations + if opts.UseBasicRouteTypes { + // Convert extended route types to basic route types + addExts = append(addExts, &filters.BasicRouteTypeFilter{}) + } + if opts.NormalizeTimezones { + // Normalize timezones and apply agency/stop timezones where empty + addExts = append(addExts, &filters.NormalizeTimezoneFilter{}) + addExts = append(addExts, &filters.ApplyParentTimezoneFilter{}) + } + if opts.SimplifyShapes > 0 { + // Simplify shapes.txt + addExts = append(addExts, &filters.SimplifyShapeFilter{ + 
SimplifyValue: opts.SimplifyShapes, + }) + } + if opts.NormalizeNetworks { + // Convert routes.txt network_id to networks.txt/route_networks.txt + addExts = append(addExts, &filters.RouteNetworkIDFilter{}) + } else { + addExts = append(addExts, &filters.RouteNetworkIDCompatFilter{}) + } + + return addExts +} + +// Options defines the settable options for a Copier. +type Options struct { + // Batch size + BatchSize int + // Skip shape cache + NoShapeCache bool + // Attempt to save an entity that returns validation errors + AllowEntityErrors bool + AllowReferenceErrors bool + // Interpolate any missing StopTime values: ArrivalTime/DepartureTime/ShapeDistTraveled + InterpolateStopTimes bool + // Create a stop-to-stop Shape for Trips without a ShapeID. + CreateMissingShapes bool + // Create missing Calendar entries + NormalizeServiceIDs bool + // Simplify Calendars that use mostly CalendarDates + SimplifyCalendars bool + // Copy extra files (requires CSV input) + CopyExtraFiles bool + // DeduplicateStopTimes + DeduplicateJourneyPatterns bool + // Error limit + ErrorLimit int + // Logging level + Quiet bool + // Default error handler + ErrorHandler ErrorHandler + // Entity selection strategy + Marker Marker + // Journey Pattern Key Function + JourneyPatternKey func(*gtfs.Trip) string + // Named extensions + ExtensionDefs []string + // Common extensions + CommonExtensions + // Initialized extensions + exts []optionExtLevel +} + +type optionExtLevel struct { + ext any + level int +} + +func (opts *Options) AddExtension(ext any) { + opts.AddExtensionWithLevel(ext, 0) +} + +func ParseExtensionDef(extDef string) (ext.Extension, error) { + extName, extArgs, err := ext.ParseExtensionArgs(extDef) + if err != nil { + return nil, err + } + e, err := ext.GetExtension(extName, extArgs) + if err != nil { + return nil, fmt.Errorf("error creating extension '%s' with args '%s': %s", extName, extArgs, err.Error()) + } else if e == nil { + return nil, fmt.Errorf("no registered extension 
for '%s'", extName)
+	}
+	return e, nil
+}
+
+func (opts *Options) AddExtensionWithLevel(e any, level int) {
+	opts.exts = append(opts.exts, optionExtLevel{ext: e, level: level})
+}
diff --git a/rules/null_island.go b/rules/null_island.go
index d345879e..0326c18e 100644
--- a/rules/null_island.go
+++ b/rules/null_island.go
@@ -1,8 +1,6 @@
 package rules
 
 import (
-	"fmt"
-
 	"github.com/interline-io/transitland-lib/gtfs"
 	"github.com/interline-io/transitland-lib/service"
 	"github.com/interline-io/transitland-lib/tt"
@@ -12,7 +10,11 @@ import (
 type ZeroCoordinateError struct{ bc }
 
+// Error returns the message recorded for the offending entity (stop or shape).
 func (e *ZeroCoordinateError) Error() string {
-	return fmt.Sprintf("entity '%s' has coordinates that include (0,0)", e.EntityID)
+	if e.Message != "" {
+		return e.Message
+	}
+	return "entity has a zero coordinate"
 }
 
 // NullIslandCheck checks for ZeroCoordinateError.
@@ -26,13 +28,13 @@ func (e *NullIslandCheck) Validate(ent tt.Entity) []error {
 			return nil // allowed
 		}
 		coords := v.Coordinates()
-		if coords[0] == 0 && coords[1] == 0 {
-			return []error{&ZeroCoordinateError{bc: bc{Field: "stop_lat", EntityID: v.StopID.Val, Message: "stop has (0,0) coordinates"}}}
+		if coords[0] == 0 || coords[1] == 0 {
+			return []error{&ZeroCoordinateError{bc: bc{Field: "stop_lat", EntityID: v.StopID.Val, Message: "stop has a zero coordinate"}}}
 		}
 	case *service.ShapeLine:
 		for _, coords := range v.Geometry.Val.Coords() {
-			if coords[0] == 0 && coords[1] == 0 {
-				return []error{&ZeroCoordinateError{bc: bc{Field: "shape_pt_lon", EntityID: v.ShapeID.Val, Message: "shape has (0,0) coordinates"}}}
+			if coords[0] == 0 || coords[1] == 0 {
+				return []error{&ZeroCoordinateError{bc: bc{Field: "shape_pt_lon", EntityID: v.ShapeID.Val, Message: "shape has a zero coordinate"}}}
 			}
 		}
 	}
diff --git a/rules/shape_segment_length.go b/rules/shape_segment_length.go
new file mode 100644
index 00000000..ab026175
--- /dev/null
+++ b/rules/shape_segment_length.go
@@ -0,0 +1,53 @@
+package rules
+
+import (
+	"fmt"
+
+	"github.com/interline-io/transitland-lib/service"
+	
"github.com/interline-io/transitland-lib/tlxy"
+	"github.com/interline-io/transitland-lib/tt"
+)
+
+// ShapeSegmentLengthError reports a shape whose longest segment exceeds the allowed maximum.
+type ShapeSegmentLengthError struct {
+	MaxAllowedDistance float64
+	MaxDistance        float64
+	bc
+}
+
+func (e ShapeSegmentLengthError) Error() string {
+	return fmt.Sprintf("shape segment length exceeds maximum allowed distance: %f > %f", e.MaxDistance, e.MaxAllowedDistance)
+}
+
+// ShapeMaxSegmentLengthCheck flags shapes with a segment longer than MaxAllowedDistance (meters).
+type ShapeMaxSegmentLengthCheck struct {
+	MaxAllowedDistance float64
+}
+
+// Validate returns a ShapeSegmentLengthError if two consecutive ShapeLine points are too far apart.
+func (e *ShapeMaxSegmentLengthCheck) Validate(ent tt.Entity) []error {
+	if e.MaxAllowedDistance <= 0 {
+		return nil
+	}
+	v, ok := ent.(*service.ShapeLine)
+	if !ok {
+		return nil
+	}
+	maxLength := 0.0
+	pts := v.Geometry.ToPoints()
+	// Measure each point against its predecessor, not against the first point of the shape.
+	for i := 1; i < len(pts); i++ {
+		d := tlxy.DistanceHaversine(pts[i-1], pts[i])
+		if d > maxLength {
+			maxLength = d
+		}
+	}
+	if maxLength <= e.MaxAllowedDistance {
+		return nil
+	}
+	return []error{ShapeSegmentLengthError{
+		MaxAllowedDistance: e.MaxAllowedDistance,
+		MaxDistance:        maxLength,
+		bc:                 bc{EntityID: v.ShapeID.Val},
+	}}
+}
diff --git a/validator/validator.go b/validator/validator.go
index fa2000aa..afcb4fd4 100644
--- a/validator/validator.go
+++ b/validator/validator.go
@@ -353,7 +353,6 @@ func (v *Validator) copierOptions() copier.Options {
 	cpOpts.AllowEntityErrors = true
 	cpOpts.AllowReferenceErrors = true
 	cpOpts.AddExtensionWithLevel(v.rtValidator, 1)
-
 	// Best practices extension
 	if v.Options.BestPractices {
 		cpOpts.AddExtensionWithLevel(&rules.NoScheduledServiceCheck{}, 1)
@@ -376,6 +375,9 @@ func (v *Validator) copierOptions() copier.Options {
 		cpOpts.AddExtensionWithLevel(&rules.MinTransferTimeCheck{}, 1)
 		cpOpts.AddExtensionWithLevel(&rules.RouteNamesPrefixCheck{}, 1)
 		cpOpts.AddExtensionWithLevel(&rules.RouteNamesCharactersCheck{}, 1)
+		cpOpts.AddExtensionWithLevel(&rules.ShapeMaxSegmentLengthCheck{
+			MaxAllowedDistance: 1_000_000, // 1000 km
+		}, 1)
 	}
 	return cpOpts
 }