Skip to content

Commit

Permalink
[processing] Fix inefficiencies in Delete Duplicate Geometries algorithm
Browse files Browse the repository at this point in the history
..and make progress bar more accurate.

Use a spatial index to avoid comparing every feature to every other
feature, and only compare against features with intersecting bounding
boxes instead. Also optimise feature requests and loop logic.

Benchmarks:

Point layer, 6000k features

Before: 30 seconds
After: 0.15 seconds

Point layer, 45k features

Before: > 10 minutes
After: 7 seconds

Fixes #19973
  • Loading branch information
nyalldawson committed Sep 28, 2018
1 parent 6110931 commit 9698444
Showing 1 changed file with 36 additions and 11 deletions.
47 changes: 36 additions & 11 deletions python/plugins/processing/algs/qgis/DeleteDuplicateGeometries.py
Expand Up @@ -70,6 +70,7 @@ def processAlgorithm(self, parameters, context, feedback):
raise QgsProcessingException(self.invalidSinkError(parameters, self.OUTPUT))

features = source.getFeatures(QgsFeatureRequest().setSubsetOfAttributes([]))

total = 100.0 / source.featureCount() if source.featureCount() else 0
geoms = dict()
index = QgsSpatialIndex()
Expand All @@ -78,28 +79,52 @@ def processAlgorithm(self, parameters, context, feedback):
break

geoms[f.id()] = f.geometry()
#index.insertFeature
feedback.setProgress(int(current * total))
index.addFeature(f)

feedback.setProgress(int(0.10 * current * total)) # takes about 10% of time

cleaned = dict(geoms)
# start by assuming everything is unique, and chop away at this list
unique_features = dict(geoms)

for i, g in list(geoms.items()):
current = 0
for feature_id, geometry in geoms.items():
if feedback.isCanceled():
break

for j in list(cleaned.keys()):
if i == j or i not in cleaned:
if feature_id not in unique_features:
# feature was already marked as a duplicate
continue

candidates = index.intersects(geometry.boundingBox())
candidates.remove(feature_id)

for candidate_id in candidates:
if candidate_id not in unique_features:
# candidate already marked as a duplicate (not sure if this is possible,
# since it would mean the current feature would also have to be a duplicate!
# but let's be safe!)
continue
if g.isGeosEqual(cleaned[j]):
del cleaned[j]

total = 100.0 / len(cleaned) if cleaned else 1
request = QgsFeatureRequest().setFilterFids(list(cleaned.keys()))
if geometry.isGeosEqual(geoms[candidate_id]):
# candidate is a duplicate of feature
del unique_features[candidate_id]

current += 1
feedback.setProgress(int(0.80 * current * total) + 10) # takes about 80% of time

total = 100.0 / len(unique_features) if unique_features else 1

# now, fetch all the feature attributes for the unique features only
# be super-smart and don't re-fetch geometries
request = QgsFeatureRequest().setFilterFids(list(unique_features.keys())).setFlags(QgsFeatureRequest.NoGeometry)
for current, f in enumerate(source.getFeatures(request)):
if feedback.isCanceled():
break

# use already fetched geometry
f.setGeometry(unique_features[f.id()])
sink.addFeature(f, QgsFeatureSink.FastInsert)
feedback.setProgress(int(current * total))

feedback.setProgress(int(0.10 * current * total) + 90) # takes about 10% of time

return {self.OUTPUT: dest_id}

0 comments on commit 9698444

Please sign in to comment.