Skip to content

Commit 4e78e03

Browse files
committed Sep 7, 2017
[FEATURE] Improve Stats by Categories algorithm
- allow non-spatial inputs
- allow calculation of stats on any field type, with specific string and datetime stats calculated when the field type matches
- output a full set of stats for numeric fields (including median, quartiles, etc.)
- also calculate stats for the 'null' category
1 parent 20d8244 commit 4e78e03

File tree

1 file changed

+166
-26
lines changed

1 file changed

+166
-26
lines changed
 

‎python/plugins/processing/algs/qgis/StatisticsByCategories.py

100644 → 100755
Lines changed: 166 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,8 @@
2828

2929
from qgis.core import (QgsProcessingParameterFeatureSource,
3030
QgsStatisticalSummary,
31+
QgsDateTimeStatisticalSummary,
32+
QgsStringStatisticalSummary,
3133
QgsFeatureRequest,
3234
QgsProcessingParameterField,
3335
QgsProcessingParameterFeatureSink,
@@ -36,13 +38,16 @@
3638
QgsWkbTypes,
3739
QgsCoordinateReferenceSystem,
3840
QgsFeature,
39-
QgsFeatureSink)
41+
QgsFeatureSink,
42+
QgsProcessing,
43+
NULL)
4044
from qgis.PyQt.QtCore import QVariant
4145
from processing.algs.qgis.QgisAlgorithm import QgisAlgorithm
4246

47+
from collections import defaultdict
4348

44-
class StatisticsByCategories(QgisAlgorithm):
4549

50+
class StatisticsByCategories(QgisAlgorithm):
4651
INPUT = 'INPUT'
4752
VALUES_FIELD_NAME = 'VALUES_FIELD_NAME'
4853
CATEGORIES_FIELD_NAME = 'CATEGORIES_FIELD_NAME'
@@ -56,13 +61,15 @@ def __init__(self):
5661

5762
def initAlgorithm(self, config=None):
    """
    Registers the algorithm parameters: the input layer, the field to
    calculate statistics on, the field holding the categories and the
    output sink.
    """
    # QgsProcessing.TypeVector is passed so that non-spatial inputs are
    # accepted as well (per the commit description)
    source_param = QgsProcessingParameterFeatureSource(
        self.INPUT,
        self.tr('Input vector layer'),
        types=[QgsProcessing.TypeVector])
    self.addParameter(source_param)

    # no field type filter is passed here: statistics can be calculated
    # on any field type
    values_param = QgsProcessingParameterField(
        self.VALUES_FIELD_NAME,
        self.tr('Field to calculate statistics on'),
        parentLayerParameterName=self.INPUT)
    self.addParameter(values_param)

    categories_param = QgsProcessingParameterField(
        self.CATEGORIES_FIELD_NAME,
        self.tr('Field with categories'),
        parentLayerParameterName=self.INPUT,
        type=QgsProcessingParameterField.Any)
    self.addParameter(categories_param)

    output_param = QgsProcessingParameterFeatureSink(
        self.OUTPUT,
        self.tr('Statistics by category'))
    self.addParameter(output_param)
6875

@@ -78,46 +85,179 @@ def processAlgorithm(self, parameters, context, feedback):
7885
category_field_name = self.parameterAsString(parameters, self.CATEGORIES_FIELD_NAME, context)
7986

8087
value_field_index = source.fields().lookupField(value_field_name)
88+
value_field = source.fields().at(value_field_index)
8189
category_field_index = source.fields().lookupField(category_field_name)
8290

83-
features = source.getFeatures(QgsFeatureRequest().setFlags(QgsFeatureRequest.NoGeometry))
84-
total = 100.0 / source.featureCount() if source.featureCount() else 0
85-
values = {}
91+
# generate output fields
92+
fields = QgsFields()
93+
fields.append(source.fields().at(category_field_index))
94+
95+
def addField(name):
    """
    Adds a field to the output, keeping the same data type as the value_field

    :param name: name given to the new output field
    """
    # Copy the field before renaming it: a plain assignment would only
    # alias value_field, so setName() would rename value_field itself on
    # every call.
    field = QgsField(value_field)
    field.setName(name)
    fields.append(field)
102+
103+
if value_field.isNumeric():
104+
field_type = 'numeric'
105+
fields.append(QgsField('count', QVariant.Int))
106+
fields.append(QgsField('unique', QVariant.Int))
107+
fields.append(QgsField('min', QVariant.Double))
108+
fields.append(QgsField('max', QVariant.Double))
109+
fields.append(QgsField('range', QVariant.Double))
110+
fields.append(QgsField('sum', QVariant.Double))
111+
fields.append(QgsField('mean', QVariant.Double))
112+
fields.append(QgsField('median', QVariant.Double))
113+
fields.append(QgsField('stddev', QVariant.Double))
114+
fields.append(QgsField('minority', QVariant.Double))
115+
fields.append(QgsField('majority', QVariant.Double))
116+
fields.append(QgsField('q1', QVariant.Double))
117+
fields.append(QgsField('q3', QVariant.Double))
118+
fields.append(QgsField('iqr', QVariant.Double))
119+
elif value_field.type() in (QVariant.Date, QVariant.Time, QVariant.DateTime):
120+
field_type = 'datetime'
121+
fields.append(QgsField('count', QVariant.Int))
122+
fields.append(QgsField('unique', QVariant.Int))
123+
fields.append(QgsField('empty', QVariant.Int))
124+
fields.append(QgsField('filled', QVariant.Int))
125+
# keep same data type for these fields
126+
addField('min')
127+
addField('max')
128+
else:
129+
field_type = 'string'
130+
fields.append(QgsField('count', QVariant.Int))
131+
fields.append(QgsField('unique', QVariant.Int))
132+
fields.append(QgsField('empty', QVariant.Int))
133+
fields.append(QgsField('filled', QVariant.Int))
134+
# keep same data type for these fields
135+
addField('min')
136+
addField('max')
137+
fields.append(QgsField('min_length', QVariant.Int))
138+
fields.append(QgsField('max_length', QVariant.Int))
139+
fields.append(QgsField('mean_length', QVariant.Double))
140+
141+
features = source.getFeatures(QgsFeatureRequest().setFlags(QgsFeatureRequest.NoGeometry).setSubsetOfAttributes(
142+
[value_field_index, category_field_index]))
143+
total = 50.0 / source.featureCount() if source.featureCount() else 0
144+
values = defaultdict(list)
86145
for current, feat in enumerate(features):
87146
if feedback.isCanceled():
88147
break
89148

90149
feedback.setProgress(int(current * total))
91150
attrs = feat.attributes()
92151
try:
93-
value = float(attrs[value_field_index])
152+
if field_type == 'numeric':
153+
if attrs[value_field_index] == NULL:
154+
continue
155+
else:
156+
value = float(attrs[value_field_index])
157+
elif attrs[value_field_index] == NULL:
158+
value = NULL
159+
elif field_type == 'string':
160+
value = str(attrs[value_field_index])
161+
else:
162+
value = attrs[value_field_index]
94163
cat = attrs[category_field_index]
95-
if cat not in values:
96-
values[cat] = []
97164
values[cat].append(value)
98165
except:
99166
pass
100167

101-
fields = QgsFields()
102-
fields.append(source.fields().at(category_field_index))
103-
fields.append(QgsField('min', QVariant.Double))
104-
fields.append(QgsField('max', QVariant.Double))
105-
fields.append(QgsField('mean', QVariant.Double))
106-
fields.append(QgsField('stddev', QVariant.Double))
107-
fields.append(QgsField('sum', QVariant.Double))
108-
fields.append(QgsField('count', QVariant.Int))
109-
110168
(sink, dest_id) = self.parameterAsSink(parameters, self.OUTPUT, context,
111169
fields, QgsWkbTypes.NoGeometry, QgsCoordinateReferenceSystem())
112170

113-
stat = QgsStatisticalSummary(QgsStatisticalSummary.Min | QgsStatisticalSummary.Max |
114-
QgsStatisticalSummary.Mean | QgsStatisticalSummary.StDevSample |
115-
QgsStatisticalSummary.Sum | QgsStatisticalSummary.Count)
171+
if field_type == 'numeric':
172+
self.calcNumericStats(values, sink, feedback)
173+
elif field_type == 'datetime':
174+
self.calcDateTimeStats(values, sink, feedback)
175+
else:
176+
self.calcStringStats(values, sink, feedback)
177+
178+
return {self.OUTPUT: dest_id}
179+
180+
def calcNumericStats(self, values, sink, feedback):
    """
    Writes one feature per category to the sink, holding the numeric
    summary statistics of the values collected for that category.

    :param values: mapping of category value to the list of values
        gathered for that category
    :param sink: feature sink receiving one feature per category
    :param feedback: feedback object, checked for cancellation and
        updated with the progress of this step
    """
    summary = QgsStatisticalSummary()

    # this step reports progress from 50 upwards, in steps of 50 / number
    # of categories
    step = 50.0 / len(values) if values else 0
    for index, (category, category_values) in enumerate(values.items()):
        if feedback.isCanceled():
            break

        feedback.setProgress(int(index * step) + 50)

        summary.calculate(category_values)

        # attribute order: category, count, unique, min, max, range, sum,
        # mean, median, stddev, minority, majority, q1, q3, iqr
        attributes = [category,
                      summary.count(),
                      summary.variety(),
                      summary.min(),
                      summary.max(),
                      summary.range(),
                      summary.sum(),
                      summary.mean(),
                      summary.median(),
                      summary.stDev(),
                      summary.minority(),
                      summary.majority(),
                      summary.firstQuartile(),
                      summary.thirdQuartile(),
                      summary.interQuartileRange()]

        feature = QgsFeature()
        feature.setAttributes(attributes)
        sink.addFeature(feature, QgsFeatureSink.FastInsert)
122211

123-
return {self.OUTPUT: dest_id}
212+
def calcDateTimeStats(self, values, sink, feedback):
    """
    Writes one feature per category to the sink, holding the date/time
    summary statistics of the values collected for that category.

    :param values: mapping of category value to the list of values
        gathered for that category
    :param sink: feature sink receiving one feature per category
    :param feedback: feedback object, checked for cancellation and
        updated with the progress of this step
    """
    summary = QgsDateTimeStatisticalSummary()

    # this step reports progress from 50 upwards, in steps of 50 / number
    # of categories
    step = 50.0 / len(values) if values else 0
    for index, (category, category_values) in enumerate(values.items()):
        if feedback.isCanceled():
            break

        feedback.setProgress(int(index * step) + 50)

        summary.calculate(category_values)

        # attribute order: category, count, unique, empty, filled, min, max
        attributes = [category,
                      summary.count(),
                      summary.countDistinct(),
                      summary.countMissing(),
                      summary.count() - summary.countMissing(),
                      summary.statistic(QgsDateTimeStatisticalSummary.Min),
                      summary.statistic(QgsDateTimeStatisticalSummary.Max)]

        feature = QgsFeature()
        feature.setAttributes(attributes)
        sink.addFeature(feature, QgsFeatureSink.FastInsert)
236+
237+
def calcStringStats(self, values, sink, feedback):
    """
    Writes one feature per category to the sink, holding the string
    summary statistics of the values collected for that category.

    :param values: mapping of category value to the list of values
        gathered for that category
    :param sink: feature sink receiving one feature per category
    :param feedback: feedback object, checked for cancellation and
        updated with the progress of this step
    """
    summary = QgsStringStatisticalSummary()

    # this step reports progress from 50 upwards, in steps of 50 / number
    # of categories
    step = 50.0 / len(values) if values else 0
    for index, (category, category_values) in enumerate(values.items()):
        if feedback.isCanceled():
            break

        feedback.setProgress(int(index * step) + 50)

        summary.calculate(category_values)

        # attribute order: category, count, unique, empty, filled, min,
        # max, min_length, max_length, mean_length
        attributes = [category,
                      summary.count(),
                      summary.countDistinct(),
                      summary.countMissing(),
                      summary.count() - summary.countMissing(),
                      summary.min(),
                      summary.max(),
                      summary.minLength(),
                      summary.maxLength(),
                      summary.meanLength()]

        feature = QgsFeature()
        feature.setAttributes(attributes)
        sink.addFeature(feature, QgsFeatureSink.FastInsert)

0 commit comments

Comments
 (0)
Please sign in to comment.