Python ThreadPoolExecutor is faster than a loop for a CPU-bound task. How come?
Recently I've been working on a project and found behaviour that I don't understand. We have an endpoint that fetches documents from MongoDB and then applies a transformation to each document, replacing some symbols in the data using regex. What's bothering me is that for 7400 documents, applying the transformation function in a regular loop takes 6 seconds to finish, while using ThreadPoolExecutor.map
finishes in 3.4 seconds. As far as I know, the GIL
prevents the Python interpreter from running more than one thread simultaneously, so CPU-bound tasks should run slower in a ThreadPoolExecutor
than in a regular loop. But that is not the case here. How so? I'm assuming that re
operations somehow release the GIL,
but I'm not sure. Here is the code:
# parts that are responsible for substituting symbols in mongo document
class Converter:
    """Depth-first walker over nested mappings/sequences with overridable hooks.

    Subclasses customize behaviour via ``key_convert`` / ``value_convert``;
    the base implementations are identity functions. The ``reverse`` flag
    selects the direction and is passed to both hooks.
    """

    def __init__(self, reverse=False):
        # Direction flag consumed by the conversion hooks.
        self.reverse = reverse

    def key_convert(self, key, reverse, path):
        """Hook for transforming mapping keys; identity by default."""
        return key

    def value_convert(self, value, reverse, path):
        """Hook for transforming values; identity by default."""
        return value

    def recurse(self, data, path=tuple()):
        """Rebuild *data* recursively, applying the hooks along the way.

        *path* is the tuple of keys/indices leading to *data*.
        """
        if isinstance(data, Mapping):
            rebuilt = {}
            for key, val in data.items():
                child_path = path + (key,)
                # NOTE: keys are converted with the *parent* path while
                # values use the child path — preserved original behaviour.
                new_key = self.key_convert(key, self.reverse, path)
                new_val = self.value_convert(val, self.reverse, child_path)
                rebuilt[new_key] = self.recurse(new_val, child_path)
            return rebuilt
        if isinstance(data, Iterable) and not isinstance(data, (str, bytes)):
            # Sequences keep their order; each index extends the path.
            return [
                self.recurse(item, path + (i,))
                for i, item in enumerate(data)
            ]
        # Leaf node: apply the value hook directly.
        return self.value_convert(data, self.reverse, path)

    def convert(self, data):
        """Transform *data* in the forward direction."""
        self.reverse = False
        return self.recurse(data)

    def unconvert(self, data):
        """Transform *data* in the reverse direction."""
        self.reverse = True
        return self.recurse(data)
class MongoConverter(Converter):
    """Converter that rewrites document keys to be MongoDB-safe.

    Mongo forbids '.' inside a key and '$' as a key's first character,
    so keys are rewritten with placeholder characters on the way in and
    restored on the way out.
    """

    # Forward direction: '.' -> U+00B7 (middle dot), leading '$' -> '#'.
    _to_mongo = ((rec(r"\."), "\u00B7"), (rec(r"^\$"), "#"),)
    # Reverse direction: undo the placeholder substitutions.
    _from_mongo = ((rec("\u00B7"), "."), (rec("^#"), "$"),)

    def key_convert_to_mongo(self, key):
        """Rewrite *key* so MongoDB accepts it."""
        return pattern_substitute(key, self._to_mongo)

    def key_convert_from_mongo(self, key):
        """Restore a key previously rewritten for MongoDB."""
        return pattern_substitute(key, self._from_mongo)

    def key_convert(self, key, reverse, _):
        """Dispatch key conversion according to direction (path is unused)."""
        if reverse:
            return self.key_convert_from_mongo(key)
        return self.key_convert_to_mongo(key)

    def value_convert(self, value, reverse, path):
        """Pass values through on the way out; apply as_datetime on the way in.

        NOTE(review): as_datetime is defined elsewhere — presumably it coerces
        datetime-like values; confirm its contract before relying on it.
        """
        return value if reverse else as_datetime(value)
def pattern_substitute(value, pattern_substitutes):
    """Apply each (compiled_pattern, replacement) pair to *value* in order.

    Returns the string after all substitutions have been chained.
    """
    for regex, replacement in pattern_substitutes:
        value = regex.sub(replacement, value)
    return value
# storage adapter
class MongoStorage:
    """Thin adapter over a MongoDB collection that (un)converts documents."""

    def __init__(self, collection, converter=None):
        self.collection = collection
        # Fall back to the default Mongo key converter when none is supplied.
        self.converter = converter or MongoConverter()
        self._context = None
        self.executor = ThreadPoolExecutor()

    def after_find(self, data):
        """Undo the Mongo key conversion on one fetched document.

        A missing document (None) passes through unchanged.
        """
        if data is None:
            return None
        return self.converter.unconvert(data)

    def find(self, filtr=None, limit=-1, **kwargs):
        """Query the collection, unconverting each result via the executor.

        limit == 0 short-circuits to an empty list; limit == -1 means
        "no limit" (translated to pymongo's 0).
        Returns the lazy iterator produced by ``Executor.map``.
        """
        filtr = filter_converter.convert(filtr)
        if limit == 0:
            return []
        if limit == -1:
            limit = 0
        # NOTE(review): Executor.map is lazy — consumption overlaps with the
        # cursor fetch, which is the behaviour the question is about.
        cursor = self.collection.find(filtr, limit=limit, **kwargs)
        return self.executor.map(self.after_find, cursor)
Thank you for answers.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow