Here's a Python API solution using the pyvcf submodule I wrote:

>>> from fuc import pyvcf
>>> data = {
...     'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
...     'POS': [100, 101, 102, 103, 104],
...     'ID': ['.', '.', '.', '.', '.'],
...     'REF': ['A', 'A', 'C', 'C', 'T'],
...     'ALT': ['C,T', 'T', 'G', 'G,A', 'A'],
...     'QUAL': ['.', '.', '.', '.', '.'],
...     'FILTER': ['.', '.', '.', '.', '.'],
...     'INFO': ['.', '.', '.', '.', '.'],
...     'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'],
...     'A': ['0/2', '0/0', '0/1', './.', '0/1'],
...     'B': ['0/1', '1/1', './.', '1/2', '1/1'],
... }
>>> vf = pyvcf.VcfFrame.from_dict([], data)
>>> # vf = pyvcf.VcfFrame.from_file('in.vcf')
>>> vf.df
  CHROM  POS ID REF  ALT QUAL FILTER INFO FORMAT    A    B
0  chr1  100  .   A  C,T    .      .    .     GT  0/2  0/1
1  chr1  101  .   A    T    .      .    .     GT  0/0  1/1
2  chr1  102  .   C    G    .      .    .     GT  0/1  ./.
3  chr1  103  .   C  G,A    .      .    .     GT  ./.  1/2
4  chr1  104  .   T    A    .      .    .     GT  0/1  1/1
>>> # Remove multiallelic variants
>>> vf = vf.filter_multialt()
>>> vf.df
  CHROM  POS ID REF ALT QUAL FILTER INFO FORMAT    A    B
0  chr1  101  .   A   T    .      .    .     GT  0/0  1/1
1  chr1  102  .   C   G    .      .    .     GT  0/1  ./.
2  chr1  104  .   T   A    .      .    .     GT  0/1  1/1
>>> # Select only variants where sample B is homozygous (both alleles equal; note a missing './.' genotype also passes)
>>> def one_row(r):
...     return r['B'].split('/')[0] == r['B'].split('/')[1]
... 
>>> i = vf.df.apply(one_row, axis=1)
>>> vf.df = vf.df[i]
>>> vf.df
  CHROM  POS ID REF ALT QUAL FILTER INFO FORMAT    A    B
0  chr1  101  .   A   T    .      .    .     GT  0/0  1/1
1  chr1  102  .   C   G    .      .    .     GT  0/1  ./.
2  chr1  104  .   T   A    .      .    .     GT  0/1  1/1
>>> vf.to_file('out.vcf')



Source link