Hi Michael,
Attached is the acctst.c regression test I added, based on your test code.
I have pushed this change back to acctst.c in Launchpad.
I have also written a fix which is passing regression tests. I will probably also push that up in the morning if I don’t see any issues when I review it with fresh eyes.
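For context while you review: the property the test pins down is that ca_clear_event() must never allow a subscription callback to run after it returns. I won't paste the fix here before it's pushed, but as a generic sketch (using the libCom primitives, and emphatically not the actual CA internals) the guard it has to implement looks something like this:

/* generic sketch of a "no late callback" guard -- NOT the actual fix;
 * destroy blocks until no callback for the subscription is in flight;
 * lock/idle would be created with epicsMutexMustCreate () and
 * epicsEventMustCreate ( epicsEventEmpty ) */
struct subscrGuard {
    epicsMutexId lock;
    epicsEventId idle;   /* signaled when the last in-flight callback exits */
    int callbackBusy;    /* nonzero while a callback is running */
    int destroyed;       /* set once destroy has begun */
};

static void guardedCallback ( struct subscrGuard * pG )
{
    epicsMutexMustLock ( pG->lock );
    if ( pG->destroyed ) {
        /* destroy has begun: drop the late callback */
        epicsMutexUnlock ( pG->lock );
        return;
    }
    pG->callbackBusy++;
    epicsMutexUnlock ( pG->lock );

    /* ... run the user callback here, outside the lock ... */

    epicsMutexMustLock ( pG->lock );
    pG->callbackBusy--;
    if ( pG->callbackBusy == 0 && pG->destroyed ) {
        epicsEventSignal ( pG->idle );
    }
    epicsMutexUnlock ( pG->lock );
}

static void guardedDestroy ( struct subscrGuard * pG )
{
    epicsMutexMustLock ( pG->lock );
    pG->destroyed = TRUE;
    while ( pG->callbackBusy > 0 ) {
        epicsMutexUnlock ( pG->lock );
        epicsEventMustWait ( pG->idle );  /* block until callbacks drain */
        epicsMutexMustLock ( pG->lock );
    }
    epicsMutexUnlock ( pG->lock );
}

Note that in this naive form a destroy issued from within the callback itself would block forever, which ties directly into Michael's deadlock observation quoted below; the real code has to detect and special-case that.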
Jeff
#define multiSubscrDestroyNoLateCallbackEventCount 500

struct MultiSubscrDestroyNoLateCallbackEventData {
    evid m_id;
    size_t m_nCallback;
    int m_callbackIsOk;
    struct MultiSubscrDestroyNoLateCallbackTestData * m_pTestData;
};

struct MultiSubscrDestroyNoLateCallbackTestData {
    const char * m_pChanName;
    chid m_chan;
    epicsMutexId m_mutex;
    epicsEventId m_testDoneEvent;
    unsigned m_interestLevel;
    struct MultiSubscrDestroyNoLateCallbackEventData
        m_eventData [multiSubscrDestroyNoLateCallbackEventCount];
};
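/*
 * subscription update callback; verifies, while holding the test mutex,
 * that the subscription it was queued for has not already been destroyed
 */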
static void noLateCallbackDetect ( struct event_handler_args args )
{
    int callbackIsOk;
    struct MultiSubscrDestroyNoLateCallbackEventData * const pEventData = args.usr;
    epicsMutexLockStatus lockStatus = epicsMutexLock ( pEventData->m_pTestData->m_mutex );
    callbackIsOk = pEventData->m_callbackIsOk;
    pEventData->m_nCallback++;
    epicsMutexUnlock ( pEventData->m_pTestData->m_mutex );
    verify ( lockStatus == epicsMutexLockOK );
    verify ( callbackIsOk );
}
static void multiSubscrDestroyNoLateCallbackThread ( void * pParm )
{
    struct MultiSubscrDestroyNoLateCallbackTestData * const pTestData =
                ( struct MultiSubscrDestroyNoLateCallbackTestData * ) pParm;
    unsigned i, j;
    int status;

    status = ca_context_create ( ca_enable_preemptive_callback );
    verify ( status == ECA_NORMAL );
    status = ca_create_channel ( pTestData->m_pChanName, 0, 0,
                CA_PRIORITY_DEFAULT, &pTestData->m_chan );
    SEVCHK ( status, "multiSubscrDestroyNoLateCallbackTest: channel create failed" );
    verify ( status == ECA_NORMAL );
    status = ca_pend_io ( timeoutToPendIO );
    SEVCHK ( status, "multiSubscrDestroyNoLateCallbackTest: channel connect failed" );
    verify ( status == ECA_NORMAL );

    /*
     * create a set of subscriptions
     */
    for ( i=0; i < 10000; i++ ) {
        unsigned int priorityOfTestThread;
        for ( j=0; j < multiSubscrDestroyNoLateCallbackEventCount; j++ ) {
            epicsMutexLockStatus lockStatus = epicsMutexLock ( pTestData->m_mutex );
            verify ( lockStatus == epicsMutexLockOK );
            pTestData->m_eventData[j].m_nCallback = 0;
            pTestData->m_eventData[j].m_callbackIsOk = TRUE;
            pTestData->m_eventData[j].m_pTestData = pTestData;
            epicsMutexUnlock ( pTestData->m_mutex );
            SEVCHK ( ca_add_event ( DBR_GR_FLOAT, pTestData->m_chan, noLateCallbackDetect,
                &pTestData->m_eventData[j], &pTestData->m_eventData[j].m_id ) , NULL );
        }
        SEVCHK ( ca_flush_io(), NULL );

        /*
         * raise the priority of the current thread hoping to improve our
         * likelihood of detecting a bug
         */
        priorityOfTestThread = epicsThreadGetPrioritySelf ();
        epicsThreadSetPriority ( epicsThreadGetIdSelf(), epicsThreadPriorityHigh );

        /*
         * wait for the first subscription update to arrive
         */
        {
            epicsMutexLockStatus lockStatus = epicsMutexLock ( pTestData->m_mutex );
            verify ( lockStatus == epicsMutexLockOK );
            while ( pTestData->m_eventData[0].m_nCallback == 0 ) {
                epicsMutexUnlock ( pTestData->m_mutex );
                epicsThreadSleep ( 50e-6 );
                lockStatus = epicsMutexLock ( pTestData->m_mutex );
                verify ( lockStatus == epicsMutexLockOK );
            }
            epicsMutexUnlock ( pTestData->m_mutex );
        }
        /*
         * try to destroy all of the subscriptions at precisely the same time that
         * their first callbacks are running
         */
        for ( j=0; j < multiSubscrDestroyNoLateCallbackEventCount; j++ ) {
            epicsMutexLockStatus lockStatus;
            SEVCHK ( ca_clear_event ( pTestData->m_eventData[j].m_id ) , NULL );
            lockStatus = epicsMutexLock ( pTestData->m_mutex );
            verify ( lockStatus == epicsMutexLockOK );
            pTestData->m_eventData[j].m_callbackIsOk = FALSE;
            epicsMutexUnlock ( pTestData->m_mutex );
        }

        /*
         * return to the original priority
         */
        epicsThreadSetPriority ( epicsThreadGetIdSelf(), priorityOfTestThread );

        if ( i % 1000 == 0 ) {
            showProgress ( pTestData->m_interestLevel );
        }
    }
    SEVCHK ( ca_clear_channel ( pTestData->m_chan ), NULL );
    ca_context_destroy ();
    epicsEventMustTrigger ( pTestData->m_testDoneEvent );
}
/*
 * verify that, in a preemptive callback mode client, a subscription callback never
 * comes after the subscription is destroyed
 */
static void multiSubscrDestroyNoLateCallbackTest ( const char *pName, unsigned interestLevel )
{
    struct MultiSubscrDestroyNoLateCallbackTestData * pTestData;

    showProgressBegin ( "multiSubscrDestroyNoLateCallbackTest", interestLevel );

    pTestData = calloc ( 1u, sizeof ( struct MultiSubscrDestroyNoLateCallbackTestData ) );
    verify ( pTestData );
    pTestData->m_mutex = epicsMutexMustCreate ();
    pTestData->m_testDoneEvent = epicsEventMustCreate ( epicsEventEmpty );
    pTestData->m_pChanName = pName;
    pTestData->m_interestLevel = interestLevel;
    epicsThreadMustCreate (
                "multiSubscrDestroyNoLateCallbackTest",
                epicsThreadPriorityLow,
                epicsThreadGetStackSize ( epicsThreadStackMedium ),
                multiSubscrDestroyNoLateCallbackThread,
                pTestData );

    /*
     * wait for test to complete
     */
    epicsEventMustWait ( pTestData->m_testDoneEvent );

    /*
     * cleanup
     */
    epicsMutexDestroy ( pTestData->m_mutex );
    epicsEventDestroy ( pTestData->m_testDoneEvent );
    free ( pTestData );

    showProgressEnd ( interestLevel );
}
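For anyone wiring this up: the test hooks into the acctst run sequence like the other tests there; a minimal sketch of the call site (the "fred" channel name is hypothetical, standing in for whatever PV name the harness passes through):

    /* hypothetical call site in the acctst run sequence; "fred" stands
     * in for the PV name supplied by the harness */
    multiSubscrDestroyNoLateCallbackTest ( "fred", interestLevel );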
> -----Original Message-----
> From: [email protected] [mailto:[email protected]] On Behalf
> Of Michael Abbott
> Sent: Tuesday, May 14, 2013 12:25 AM
> To: Hill, Jeff
> Subject: [Bug 1179642] Re: race condition when destroying subscription in
> preemptive callback mode application
>
> Here's the attachment from the original e-mail.
>
> ** Attachment added: "Test IOC to demonstrate race condition reported here"
> https://bugs.launchpad.net/epics-base/+bug/1179642/+attachment/3675984/+files/test.tgz
>
> --
> You received this bug notification because you are subscribed to the bug
> report.
> https://bugs.launchpad.net/bugs/1179642
>
> Title:
> race condition when destroying subscription in preemptive callback mode
> application
>
> Status in EPICS Base:
> Confirmed
>
> Bug description:
> From Michael Abbott
>
> see http://www.aps.anl.gov/epics/tech-talk/2012/msg00584.php
>
> > > In the attached test IOC I repeatedly create 500 subscriptions to 500
> > > locally published PVs, pause a few hundred microseconds, and then
> > > proceed to tear them all down again. The context pointer I pass
> > > (args.usr) just contains a validity flag which I reset after
> > > ca_clear_subscription returns -- and which I test in the callback.
> > >
> > > Below is a typical run:
> > >
> > > $ ./test 10 500
> > > dbLoadDatabase("dbd/TEST.dbd", NULL, NULL)
> > > TEST_registerRecordDeviceDriver(pdbbase)
> > > dbLoadRecords("db/TEST.db", NULL)
> > > iocInit()
> > > Starting iocInit
> > >
> > > ############################################################################
> > > ## EPICS R3.14.11 $R3-14-11$ $2009/08/28 18:47:36$
> > > ## EPICS Base built Nov 4 2011
> > >
> > > ############################################################################
> > > iocRun: All initialization complete
> > > All channels connected
> > > Testing 10 cycles, interval 500 us
> > > [......................................................................
> > > .......................................................................
> > > .......................................................................
> > > .......................................................................
> > > .......................................................................
> > > .......................................................................
> > > ...............................................................whoops!
> > > ][
> > >
> > >
> > > The two arguments to `test` are number of times to try and how long to
> > > pause between create and clear (in microseconds, passed to usleep(3)).
> > > [ and ] are printed at the start and end of a cycle (so [ is
> > > immediately followed by a burst of ca_create_subscription() calls) and
> > > each . represents a successful callback. An unsuccessful (invalid)
> > > callback is shown by 'whoops!' which is followed by an exit() call.
> > >
> > > This test can be very delicate and difficult to reproduce, and may need
> > > to be run many times with slightly different pause intervals before
> > > being even partially repeatable -- the fault only appears to show when
> > > there isn't time for all 500 PVs to complete their initial updates, but
> > > there has to be enough time for them all to make the effort.
> > >
> > > Another interesting detail follows from some locking I'm doing. Here
> > > is an extract of the relevant code (LOCK() is just
> > > pthread_mutex_lock(3p) on a global mutex):
> > >
> > > 1 static void on_update(struct event_handler_args args)
> > > 2 {
> > > 3 struct event *event = args.usr;
> > > 4 LOCK();
> > > 5 bool valid = event->valid;
> > > 6 UNLOCK();
> > > 7 if (valid) ...
> > > 8 }
> > >
> > > ...
> > >
> > > 9 LOCK(); // This should trigger deadlock
> > > 10 ca_clear_subscription(event->event_id);
> > > 11 event->valid = false;
> > > 12 UNLOCK();
> > >
> > > It seems to me that if ca_clear_subscription() is correctly doing what
> > > we discussed a year ago, which is to say, if it is waiting for all
> > > outstanding callbacks to complete before returning, then the LOCK() on
> > > line 9 should trigger a deadlock when ca_clear_subscription() is called
> > > with its associated callback still only on line 3 (or earlier). But I
> > > never see my test deadlock.
> > >
> > > I'm seeing this problem occur on test code which is repeatedly creating
> > > and destroying subscriptions, but I've previously reported this on CA
> > > client shutdown, so it does look to me like there is a general
> > > synchronisation problem here. I believe I have a workaround, which is
> > > to delay releasing the callback context to give time for outstanding
> > > callbacks to complete, but this is a bit worrisome...
> >
> >
>
> To manage notifications about this bug go to:
> https://bugs.launchpad.net/epics-base/+bug/1179642/+subscriptions