Lean  $LEAN_TAG$
PandasConverter.DataFrameGenerator.cs
1 /*
2  * QUANTCONNECT.COM - Democratizing Finance, Empowering Individuals.
3  * Lean Algorithmic Trading Engine v2.0. Copyright 2014 QuantConnect Corporation.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14 */
15 
16 using Python.Runtime;
17 using QuantConnect.Data;
20 using QuantConnect.Util;
21 using System;
22 using System.Collections;
23 using System.Collections.Generic;
24 using System.Linq;
25 
26 namespace QuantConnect.Python
27 {
28  public partial class PandasConverter
29  {
30  /// <summary>
31  /// Helper class to generate data frames from slices
32  /// </summary>
33  private class DataFrameGenerator
34  {
// Index level names handed to ConcatDataFrames when the data frames of flattened collections are concatenated
private static readonly string[] MultiBaseDataCollectionDataFrameNames = { "collection_symbol", "time" };
private static readonly string[] MultiCanonicalSymbolsDataFrameNames = { "canonical", "time" };
private static readonly string[] SingleBaseDataCollectionDataFrameNames = { "time" };

// Requested data type (null means no specific type was requested) and the flags derived from it in the constructor
private readonly Type _dataType;
private readonly bool _requestedTick;
private readonly bool _requestedTradeBar;
private readonly bool _requestedQuoteBar;

// Forwarded to every PandasData instance created by this generator
private readonly bool _timeAsColumn;

// Whether collection data is stored separately (see AddCollection) instead of being added to a PandasData instance
private readonly bool _flatten;

/// <summary>
/// PandasData instances for each symbol. Does not hold BaseDataCollection instances.
/// </summary>
private Dictionary<Symbol, PandasData> _pandasData;

// Collections registered through AddCollection, kept along with the symbol and time they were added with
private List<(Symbol Symbol, DateTime Time, IEnumerable<ISymbolProvider> Data)> _collections;

// Highest PandasData.Levels value seen among the PandasData instances created so far
private int _maxLevels;

// Set when the generic AddData overload ends up with data for more than one symbol
private bool _shouldUseSymbolOnlyIndex;
54 
/// <summary>
/// Initializes an empty generator. Data is expected to be added afterwards through one of the AddData overloads.
/// </summary>
/// <param name="dataType">The requested data type. When null, ticks, trade bars and quote bars are all considered requested</param>
/// <param name="timeAsColumn">Value forwarded to the <see cref="PandasData"/> instances created by this generator</param>
/// <param name="flatten">Whether base data collections should be handled separately so they can be flattened</param>
protected DataFrameGenerator(Type dataType = null, bool timeAsColumn = false, bool flatten = false)
{
    _dataType = dataType;
    _timeAsColumn = timeAsColumn;
    _flatten = flatten;

    // with no specific type requested every supported kind of data is checked
    var anyType = dataType == null;
    _requestedTick = anyType || dataType == typeof(Tick) || dataType == typeof(OpenInterest);
    _requestedTradeBar = anyType || dataType == typeof(TradeBar);
    _requestedQuoteBar = anyType || dataType == typeof(QuoteBar);
}
65 
/// <summary>
/// Initializes a generator and loads it with the data found in the given slices
/// </summary>
/// <param name="slices">The slices to take the data from</param>
/// <param name="flatten">Whether base data collections should be handled separately so they can be flattened</param>
/// <param name="dataType">The requested data type. When null, every supported kind of data is considered</param>
public DataFrameGenerator(IEnumerable<Slice> slices, bool flatten = false, Type dataType = null)
    : this(dataType, timeAsColumn: false, flatten: flatten)
{
    AddData(slices);
}
71 
/// <summary>
/// Walks through every data point of the given slices and stores it for the later data frame generation.
/// When flattening is enabled, data points that are collections (see <see cref="IsCollection"/>) go to the collections list.
/// Every other data point is added to the PandasData instance of its symbol, which is created on first use.
/// </summary>
/// <param name="slices">The slices to take the data from</param>
protected void AddData(IEnumerable<Slice> slices)
{
    // these only depend on read-only state, so they are resolved once up front
    var bothBarTypesRequested = _requestedTradeBar && _requestedQuoteBar;
    var noBarTypeRequested = !_requestedTradeBar && !_requestedQuoteBar;

    // symbols already handled in the current slice, created lazily since it is only needed when both bar types are requested
    HashSet<SecurityIdentifier> handledInSlice = null;

    foreach (var slice in slices)
    {
        foreach (var data in slice.AllData)
        {
            if (_flatten && IsCollection(data.GetType()))
            {
                AddCollection(data.Symbol, data.EndTime, (data as IEnumerable).Cast<ISymbolProvider>());
                continue;
            }

            var pandasData = GetPandasData(data);

            // custom data and requested ticks are added as they come
            if (pandasData.IsCustomData || (_requestedTick && data is Tick))
            {
                pandasData.Add(data);
                continue;
            }

            // support for auxiliary data history requests
            if (noBarTypeRequested && _dataType != null && data.GetType().IsAssignableTo(_dataType))
            {
                pandasData.Add(data);
                continue;
            }

            // trade and quote bars of a symbol share the row in the data frame, so both are added in one go;
            // adding them separately would generate 2 rows per series. Skip the symbol if it was already handled for this slice
            if (bothBarTypesRequested)
            {
                handledInSlice ??= new();
                if (!handledInSlice.Add(data.Symbol.ID))
                {
                    continue;
                }
            }

            // the slice already organizes the bars by symbol, so the counterpart bar is looked up in its Bars/QuoteBars collections
            var tradeBar = _requestedTradeBar ? data as TradeBar : null;
            QuoteBar quoteBar;
            if (tradeBar == null)
            {
                quoteBar = _requestedQuoteBar ? data as QuoteBar : null;
                if (quoteBar != null)
                {
                    slice.Bars.TryGetValue(quoteBar.Symbol, out tradeBar);
                }
            }
            else
            {
                slice.QuoteBars.TryGetValue(tradeBar.Symbol, out quoteBar);
            }

            pandasData.Add(tradeBar, quoteBar);
        }

        // the tracking of handled symbols is per slice
        handledInSlice?.Clear();
    }
}
137 
/// <summary>
/// Stores the given data for the later data frame generation.
/// When flattening is enabled and <typeparamref name="T"/> is a collection type (see <see cref="IsCollection"/>),
/// each item is registered in the collections list.
/// Otherwise, each item is added to the PandasData instance of its symbol, which is created on first use.
/// </summary>
/// <param name="data">The data to add</param>
protected void AddData<T>(IEnumerable<T> data)
    where T : ISymbolProvider
{
    var itemsAreCollections = IsCollection(typeof(T));

    if (_flatten && itemsAreCollections)
    {
        foreach (var collection in data)
        {
            var baseData = collection as BaseData;
            var enumerable = collection as IEnumerable;
            AddCollection(baseData.Symbol, baseData.EndTime, enumerable.Cast<ISymbolProvider>());
        }

        return;
    }

    // remember the last symbol and its PandasData, so consecutive items of the same symbol skip the dictionary lookup
    Symbol lastSymbol = null;
    PandasData lastPandasData = null;
    foreach (var item in data)
    {
        PandasData target;
        if (lastSymbol != null && item.Symbol == lastSymbol)
        {
            target = lastPandasData;
        }
        else
        {
            target = GetPandasData(item);
        }

        target.Add(item);

        lastSymbol = item.Symbol;
        lastPandasData = target;
    }

    // Multiple symbols detected, use symbol only indexing for performance reasons
    if (_pandasData != null && _pandasData.Count > 1)
    {
        _shouldUseSymbolOnlyIndex = true;
    }
}
177 
/// <summary>
/// Generates the data frame
/// </summary>
/// <param name="levels">The number of levels the index should have. If not provided, it will be inferred from the data</param>
/// <param name="sort">Whether to sort the data frames on concatenation</param>
/// <param name="filterMissingValueColumns">Whether to filter missing values. See <see cref="PandasData.ToPandasDataFrame(int, bool)"/></param>
/// <param name="symbolOnlyIndex">Whether to assume the data has multiple symbols and also one data point per symbol.
/// This is used for performance purposes</param>
/// <param name="forceMultiValueSymbol">Useful when the data contains points for multiple symbols.
/// If false and <paramref name="symbolOnlyIndex"/> is true, it will assume there is a single point for each symbol,
/// and will apply performance improvements for the data frame generation.</param>
/// <returns>The result of concatenating the data frames of the stored collections and of the stored PandasData instances</returns>
public PyObject GenerateDataFrame(int? levels = null, bool sort = true, bool filterMissingValueColumns = true,
    bool symbolOnlyIndex = false, bool forceMultiValueSymbol = false)
{
    using var _ = Py.GIL();

    // The lists are created up front but filled inside the try block, one data frame at a time.
    // This way, if generating any data frame throws, the ones created before it are still disposed by the finally block.
    var pandasDataDataFrames = new List<PyObject>();
    var collectionsDataFrames = new List<(Symbol, DateTime, PyObject)>();

    try
    {
        foreach (var pandasDataDataFrame in GetPandasDataDataFrames(levels, filterMissingValueColumns, symbolOnlyIndex, forceMultiValueSymbol))
        {
            pandasDataDataFrames.Add(pandasDataDataFrame);
        }

        foreach (var collectionDataFrame in GetCollectionsDataFrames(symbolOnlyIndex, forceMultiValueSymbol))
        {
            collectionsDataFrames.Add(collectionDataFrame);
        }

        // no collections: only the per-symbol data frames need to be concatenated
        if (collectionsDataFrames.Count == 0)
        {
            return ConcatDataFrames(pandasDataDataFrames, sort, dropna: true);
        }

        // collections data frames go first, followed by the per-symbol ones
        var dataFrames = collectionsDataFrames.Select(x => x.Item3).Concat(pandasDataDataFrames);

        if (symbolOnlyIndex)
        {
            return ConcatDataFrames(dataFrames, sort, dropna: true);
        }
        else if (_collections.DistinctBy(x => x.Symbol).Count() > 1)
        {
            // collections for more than one symbol: each data frame is keyed by the collection symbol and time
            var keys = collectionsDataFrames
                .Select(x => new object[] { x.Item1, x.Item2 })
                .Concat(pandasDataDataFrames.Select(x => new object[] { x, DateTime.MinValue }));
            var names = _collections.Any(x => x.Symbol.IsCanonical())
                ? MultiCanonicalSymbolsDataFrameNames
                : MultiBaseDataCollectionDataFrameNames;

            return ConcatDataFrames(dataFrames, keys, names, sort, dropna: true);
        }
        else
        {
            // collections for a single symbol: each data frame is keyed by time only
            var keys = collectionsDataFrames
                .Select(x => new object[] { x.Item2 })
                .Concat(pandasDataDataFrames.Select(x => new object[] { DateTime.MinValue }));

            return ConcatDataFrames(dataFrames, keys, SingleBaseDataCollectionDataFrameNames, sort, dropna: true);
        }
    }
    finally
    {
        // the intermediate data frames are not needed once the concatenation is done (or has failed)
        foreach (var df in pandasDataDataFrames.Concat(collectionsDataFrames.Select(x => x.Item3)))
        {
            df.Dispose();
        }
    }
}
238 
/// <summary>
/// Lazily generates the data frames for the data stored in the <see cref="_pandasData"/> dictionary.
/// Yields nothing when there is no data, a single data frame for all the instances when symbol only indexing applies,
/// or one data frame per PandasData instance otherwise.
/// </summary>
private IEnumerable<PyObject> GetPandasDataDataFrames(int? levels, bool filterMissingValueColumns, bool symbolOnlyIndex, bool forceMultiValueSymbol)
{
    var hasData = _pandasData is not null && _pandasData.Count > 0;
    if (!hasData)
    {
        yield break;
    }

    // symbol only indexing can be requested by the caller or detected when the data was added,
    // but forcing multi value symbols always disables it
    var useSymbolOnlyIndex = (symbolOnlyIndex || _shouldUseSymbolOnlyIndex) && !forceMultiValueSymbol;
    if (useSymbolOnlyIndex)
    {
        yield return PandasData.ToPandasDataFrame(_pandasData.Values, skipTimesColumn: true);
    }
    else
    {
        foreach (var pandasData in _pandasData.Values)
        {
            // fall back to the maximum number of levels seen when no specific number was requested
            yield return pandasData.ToPandasDataFrame(levels ?? _maxLevels, filterMissingValueColumns);
        }
    }
}
260 
/// <summary>
/// Lazily generates one data frame for each of the stored base data collections, grouped by collection symbol.
/// Each data frame is returned along with the symbol and time its collection was registered with.
/// </summary>
private IEnumerable<(Symbol, DateTime, PyObject)> GetCollectionsDataFrames(bool symbolOnlyIndex, bool forceMultiValueSymbol)
{
    if (_collections is null || _collections.Count == 0)
    {
        yield break;
    }

    foreach (var symbolCollections in _collections.GroupBy(x => x.Symbol))
    {
        foreach (var (symbol, time, data) in symbolCollections)
        {
            // each collection gets its own generator, configured like this one except for the time column,
            // which is only used when not indexing by symbol only
            var collectionGenerator = new DataFrameGenerator(_dataType, timeAsColumn: !symbolOnlyIndex, flatten: _flatten);
            collectionGenerator.AddData(data);

            var collectionDataFrame = collectionGenerator.GenerateDataFrame(
                symbolOnlyIndex: symbolOnlyIndex,
                forceMultiValueSymbol: forceMultiValueSymbol);

            yield return (symbol, time, collectionDataFrame);
        }
    }
}
280 
/// <summary>
/// Gets the PandasData instance for the symbol of the given data, creating and registering it if there is none yet.
/// The maximum number of levels is updated whenever a new instance is created.
/// </summary>
private PandasData GetPandasData(ISymbolProvider data)
{
    // the dictionary is created on first use
    _pandasData ??= new();

    if (_pandasData.TryGetValue(data.Symbol, out var existing))
    {
        return existing;
    }

    var created = new PandasData(data, _timeAsColumn);
    _pandasData[data.Symbol] = created;
    _maxLevels = Math.Max(_maxLevels, created.Levels);

    return created;
}
293 
/// <summary>
/// Registers a collection, along with its symbol and time, creating the collections list on first use
/// </summary>
private void AddCollection(Symbol symbol, DateTime time, IEnumerable<ISymbolProvider> data)
{
    (_collections ??= new()).Add((symbol, time, data));
}
299 
/// <summary>
/// Checks whether the given type should be treated as a collection when flattening.
/// A type qualifies when it is a <see cref="BaseData"/> and implements <see cref="IEnumerable{ISymbolProvider}"/>.
/// This covers cases like <see cref="OptionUniverse"/> (which is a direct subclass of
/// <see cref="BaseDataCollection"/>) and <see cref="OptionChain"/>, which is a collection of <see cref="OptionContract"/>
/// </summary>
private static bool IsCollection(Type type)
{
    if (!type.IsAssignableTo(typeof(BaseData)))
    {
        return false;
    }

    foreach (var implementedInterface in type.GetInterfaces())
    {
        // only generic interfaces can be an enumerable of symbol providers
        if (!implementedInterface.IsGenericType)
        {
            continue;
        }

        var isEnumerable = implementedInterface.GetGenericTypeDefinition().IsAssignableTo(typeof(IEnumerable<>));
        if (isEnumerable && implementedInterface.GenericTypeArguments[0].IsAssignableTo(typeof(ISymbolProvider)))
        {
            return true;
        }
    }

    return false;
}
314  }
315 
/// <summary>
/// Data frame generator that is loaded from an enumerable of <typeparamref name="T"/> items instead of slices
/// </summary>
private class DataFrameGenerator<T> : DataFrameGenerator
    where T : ISymbolProvider
{
    /// <summary>
    /// Initializes a generator without a requested data type and loads it with the given data
    /// </summary>
    /// <param name="data">The data to add</param>
    /// <param name="flatten">Whether base data collections should be handled separately so they can be flattened</param>
    public DataFrameGenerator(IEnumerable<T> data, bool flatten)
        : base(dataType: null, timeAsColumn: false, flatten: flatten)
    {
        AddData(data);
    }
}
325  }
326 }